gitchennan/elasticsearch-analysis-lc-pinyin

配置参数少，功能满足需求。

对应版本

elasticsearch2.3.2 对应 elasticsearch-analysis-lc-pinyin 分支 2.4.2.1 或者 tag 2.2.2.1

创建一个类型

elasticsearch-analysis-lc-pinyin 的 README 是根据 elasticsearch5.0 编写的，给出的创建一个类型的语法如下

curl -XPOST http://localhost:9200/index/_mapping/brand -d'

{

  "brand": {

    "properties": {

      "name": {

        "type": "text",

        "analyzer": "lc_index",

        "search_analyzer": "lc_search",

        "term_vector": "with_positions_offsets"

      }

    }

  }

}'

type=text 是 elasticsearch5.0 之后才有的类型，在 elasticsearch2.3.2 中无法创建成功，因此稍作修改，将 type=text 改为 type=string，使用如下语法创建一个类型

curl -XPOST http://localhost:9200/index/_mapping/brand -d'

{

  "brand": {

    "properties": {

      "name": {

        "type": "string",

        "analyzer": "lc_index",

        "search_analyzer": "lc_search",

        "term_vector": "with_positions_offsets"

      }

    }

  }

}'

index 索引结构如下

{

  "index": {

    "aliases": {},

    "mappings": {

      "brand": {

        "properties": {

          "name": {

            "type": "string",

            "term_vector": "with_positions_offsets",

            "analyzer": "lc_index",

            "search_analyzer": "lc_search"

          }

        }

      }

    },

    "settings": {

      "index": {

        "creation_date": "1490152096129",

        "number_of_shards": "5",

        "number_of_replicas": "1",

        "uuid": "Lp1sSHGhQZyZ57LKO5KwRQ",

        "version": {

          "created": "2030299"

        }

      }

    },

    "warmers": {}

  }

}

存入几条数据

curl -XPOST http://localhost:9200/index/brand/1 -d'{"name":"百度"}'

curl -XPOST http://localhost:9200/index/brand/8 -d'{"name":"百度糯米"}'

curl -XPOST http://localhost:9200/index/brand/2 -d'{"name":"阿里巴巴"}'

curl -XPOST http://localhost:9200/index/brand/3 -d'{"name":"腾讯科技"}'

curl -XPOST http://localhost:9200/index/brand/4 -d'{"name":"网易游戏"}'

curl -XPOST http://localhost:9200/index/brand/9 -d'{"name":"大众点评"}'

curl -XPOST http://localhost:9200/index/brand/10 -d'{"name":"携程旅行网"}'

查出目前的所有数据

http://localhost:9200/index/_search

{

  "took": 70,

  "timed_out": false,

  "_shards": {

    "total": 5,

    "successful": 5,

    "failed": 0

  },

  "hits": {

    "total": 7,

    "max_score": 1,

    "hits": [

      {

        "_index": "index",

        "_type": "brand",

        "_id": "8",

        "_score": 1,

        "_source": {

          "name": "百度糯米"

        }

      },

      {

        "_index": "index",

        "_type": "brand",

        "_id": "9",

        "_score": 1,

        "_source": {

          "name": "大众点评"

        }

      },

      {

        "_index": "index",

        "_type": "brand",

        "_id": "10",

        "_score": 1,

        "_source": {

          "name": "携程旅行网"

        }

      },

      {

        "_index": "index",

        "_type": "brand",

        "_id": "2",

        "_score": 1,

        "_source": {

          "name": "阿里巴巴"

        }

      },

      {

        "_index": "index",

        "_type": "brand",

        "_id": "4",

        "_score": 1,

        "_source": {

          "name": "网易游戏"

        }

      },

      {

        "_index": "index",

        "_type": "brand",

        "_id": "1",

        "_score": 1,

        "_source": {

          "name": "百度"

        }

      },

      {

        "_index": "index",

        "_type": "brand",

        "_id": "3",

        "_score": 1,

        "_source": {

          "name": "腾讯科技"

        }

      }

    ]

  }

}

插件自带分词器 lc_index

原文：lc_index : 该分词器用于索引数据时指定，将中文转换为全拼和首字，同时保留中文

分词器分词效果

curl -X POST -d '{

  "analyzer" : "lc_index",

  "text" : ["刘德华"]

}' "http://localhost:9200/lc/_analyze"

{

  "tokens": [

    {

      "token": "刘",

      "start_offset": 0,

      "end_offset": 1,

      "type": "word",

      "position": 0

    },

    {

      "token": "liu",

      "start_offset": 0,

      "end_offset": 1,

      "type": "word",

      "position": 0

    },

    {

      "token": "l",

      "start_offset": 0,

      "end_offset": 1,

      "type": "word",

      "position": 0

    },

    {

      "token": "德",

      "start_offset": 1,

      "end_offset": 2,

      "type": "word",

      "position": 1

    },

    {

      "token": "de",

      "start_offset": 1,

      "end_offset": 2,

      "type": "word",

      "position": 1

    },

    {

      "token": "d",

      "start_offset": 1,

      "end_offset": 2,

      "type": "word",

      "position": 1

    },

    {

      "token": "华",

      "start_offset": 2,

      "end_offset": 3,

      "type": "word",

      "position": 2

    },

    {

      "token": "hua",

      "start_offset": 2,

      "end_offset": 3,

      "type": "word",

      "position": 2

    },

    {

      "token": "h",

      "start_offset": 2,

      "end_offset": 3,

      "type": "word",

      "position": 2

    }

  ]

}

插件自带分词器 lc_search

原文：lc_search: 该分词器用于拼音搜索时指定，按最小拼音分词个数拆分拼音，优先拆分全拼

curl -X POST -d '{

  "analyzer" : "lc_search",

  "text" : ["刘德华"]

}' "http://localhost:9200/index/_analyze"

{

  "tokens": [

    {

      "token": "刘",

      "start_offset": 0,

      "end_offset": 1,

      "type": "word",

      "position": 0

    },

    {

      "token": "德",

      "start_offset": 1,

      "end_offset": 2,

      "type": "word",

      "position": 1

    },

    {

      "token": "华",

      "start_offset": 2,

      "end_offset": 3,

      "type": "word",

      "position": 2

    }

  ]

}

拼音全拼

搜索 baidu，结果正确

curl -X POST -d '{

    "query": {

        "match": {

          "name": {

            "query": "baidu",

            "analyzer": "lc_search",

            "type": "phrase"

          }

        }

    },

    "highlight" : {

        "pre_tags" : ["<tag1>"],

        "post_tags" : ["</tag1>"],

        "fields" : {

            "name" : {}

        }

    }

}' "http://localhost:9200/index/brand/_search"

{

  "took": 4,

  "timed_out": false,

  "_shards": {

    "total": 5,

    "successful": 5,

    "failed": 0

  },

  "hits": {

    "total": 2,

    "max_score": 1.4054651,

    "hits": [

      {

        "_index": "index",

        "_type": "brand",

        "_id": "8",

        "_score": 1.4054651,

        "_source": {

          "name": "百度糯米"

        },

        "highlight": {

          "name": [

            "<tag1>百度</tag1>糯米"

          ]

        }

      },

      {

        "_index": "index",

        "_type": "brand",

        "_id": "1",

        "_score": 0.38356602,

        "_source": {

          "name": "百度"

        },

        "highlight": {

          "name": [

            "<tag1>百度</tag1>"

          ]

        }

      }

    ]

  }

}

单字拼音全拼与中文混合

搜索 xie程lu行，结果正确

{

  "took": 11,

  "timed_out": false,

  "_shards": {

    "total": 5,

    "successful": 5,

    "failed": 0

  },

  "hits": {

    "total": 1,

    "max_score": 2.459564,

    "hits": [

      {

        "_index": "index",

        "_type": "brand",

        "_id": "10",

        "_score": 2.459564,

        "_source": {

          "name": "携程旅行网"

        },

        "highlight": {

          "name": [

            "<tag1>携程旅行</tag1>网"

          ]

        }

      }

    ]

  }

}

单字拼音首字母与中文混合

搜索 携cl行，结果正确

curl -X POST -d '{

    "query": {

        "match": {

          "name": {

            "query": "携cl行",

            "analyzer": "lc_search",

            "type": "phrase"

          }

        }

    },

    "highlight" : {

        "pre_tags" : ["<tag1>"],

        "post_tags" : ["</tag1>"],

        "fields" : {

            "name" : {}

        }

    }

}' "http://localhost:9200/index/brand/_search"

{

  "took": 6,

  "timed_out": false,

  "_shards": {

    "total": 5,

    "successful": 5,

    "failed": 0

  },

  "hits": {

    "total": 1,

    "max_score": 2.459564,

    "hits": [

      {

        "_index": "index",

        "_type": "brand",

        "_id": "10",

        "_score": 2.459564,

        "_source": {

          "name": "携程旅行网"

        },

        "highlight": {

          "name": [

            "<tag1>携程旅行</tag1>网"

          ]

        }

      }

    ]

  }

}

拼音首字母

搜索 albb，结果正确

curl -X POST -d '{

    "query": {

        "match": {

          "name": {

            "query": "albb",

            "analyzer": "lc_search",

            "type": "phrase"

          }

        }

    },

    "highlight" : {

        "pre_tags" : ["<tag1>"],

        "post_tags" : ["</tag1>"],

        "fields" : {

            "name" : {}

        }

    }

}' "http://localhost:9200/index/brand/_search"

{

  "took": 4,

  "timed_out": false,

  "_shards": {

    "total": 5,

    "successful": 5,

    "failed": 0

  },

  "hits": {

    "total": 1,

    "max_score": 2.828427,

    "hits": [

      {

        "_index": "index",

        "_type": "brand",

        "_id": "2",

        "_score": 2.828427,

        "_source": {

          "name": "阿里巴巴"

        },

        "highlight": {

          "name": [

            "<tag1>阿里巴巴</tag1>"

          ]

        }

      }

    ]

  }

}

结论

elasticsearch-analysis-lc-pinyin 支持按照全拼、首字母以及拼音与中文混合的方式进行搜索

elasticsearch-analysis-pinyin v1.7.2

github 项目 elasticsearch-analysis-pinyin v1.7.2 完全是为 elasticsearch 2.3.2 服务

first_letter 改变

first_letter=prefix padding_char=" "

curl -X POST -d '{

"mappings": {

		 "folk": {

			"properties": {

			   "text": {

				  "type": "string",

				  "analyzer": "pinyin_analyzer"

			   }

			}

		 }

	  },

	"settings": {

		"index" : {

			"analysis" : {

				"analyzer" : {

					"pinyin_analyzer" : {

						"tokenizer" : "my_pinyin",

						"filter" : ["word_delimiter"]

					}

				},

				"tokenizer" : {

					"my_pinyin" : {

						"type" : "pinyin",

						"first_letter" : "prefix",

						"padding_char" : " "

					}

				}

			}

		}

	}

}' "http://localhost:9200/medcl"

拼音效果如下

curl -X POST -d '{

  "analyzer" : "pinyin_analyzer",

  "text" : ["刘德华"]

}' "http://localhost:9200/medcl/_analyze"

{

  "tokens": [

    {

      "token": "ldh",

      "start_offset": 0,

      "end_offset": 3,

      "type": "word",

      "position": 0

    },

    {

      "token": "liu",

      "start_offset": 0,

      "end_offset": 3,

      "type": "word",

      "position": 1

    },

    {

      "token": "de",

      "start_offset": 0,

      "end_offset": 3,

      "type": "word",

      "position": 2

    },

    {

      "token": "hua",

      "start_offset": 0,

      "end_offset": 3,

      "type": "word",

      "position": 3

    }

  ]

}

first_letter=append padding_char=" "

curl -X POST -d '{

"mappings": {

		 "folk": {

			"properties": {

			   "text": {

				  "type": "string",

				  "analyzer": "pinyin_analyzer"

			   }

			}

		 }

	  },

	"settings": {

		"index" : {

			"analysis" : {

				"analyzer" : {

					"pinyin_analyzer" : {

						"tokenizer" : "my_pinyin",

						"filter" : ["word_delimiter"]

					}

				},

				"tokenizer" : {

					"my_pinyin" : {

						"type" : "pinyin",

						"first_letter" : "append",

						"padding_char" : " "

					}

				}

			}

		}

	}

}' "http://localhost:9200/medcl2"

拼音效果如下

curl -X POST -d '{

  "analyzer" : "pinyin_analyzer",

  "text" : ["刘德华"]

}' "http://localhost:9200/medcl2/_analyze"

{

  "tokens": [

    {

      "token": "liu",

      "start_offset": 0,

      "end_offset": 3,

      "type": "word",

      "position": 0

    },

    {

      "token": "de",

      "start_offset": 0,

      "end_offset": 3,

      "type": "word",

      "position": 1

    },

    {

      "token": "hua",

      "start_offset": 0,

      "end_offset": 3,

      "type": "word",

      "position": 2

    },

    {

      "token": "ldh",

      "start_offset": 0,

      "end_offset": 3,

      "type": "word",

      "position": 3

    }

  ]

}

first_letter=only padding_char=" "

curl -X POST -d '{

"mappings": {

		 "folk": {

			"properties": {

			   "text": {

				  "type": "string",

				  "analyzer": "pinyin_analyzer"

			   }

			}

		 }

	  },

	"settings": {

		"index" : {

			"analysis" : {

				"analyzer" : {

					"pinyin_analyzer" : {

						"tokenizer" : "my_pinyin",

						"filter" : ["word_delimiter"]

					}

				},

				"tokenizer" : {

					"my_pinyin" : {

						"type" : "pinyin",

						"first_letter" : "only",

						"padding_char" : " "

					}

				}

			}

		}

	}

}' "http://localhost:9200/medcl3"

拼音效果如下

curl -X POST -H "Cache-Control: no-cache" -H "Postman-Token: 67015c0d-cd07-961b-4c46-da90f7d558d8" -d '{

  "analyzer" : "pinyin_analyzer",

  "text" : ["刘德华"]

}' "http://localhost:9200/medcl3/_analyze"

{

  "tokens": [

    {

      "token": "ldh",

      "start_offset": 0,

      "end_offset": 3,

      "type": "word",

      "position": 0

    }

  ]

}

first_letter=none padding_char=" "

curl -X POST -d '{

"mappings": {

		 "folk": {

			"properties": {

			   "text": {

				  "type": "string",

				  "analyzer": "pinyin_analyzer"

			   }

			}

		 }

	  },

	"settings": {

		"index" : {

			"analysis" : {

				"analyzer" : {

					"pinyin_analyzer" : {

						"tokenizer" : "my_pinyin",

						"filter" : ["word_delimiter"]

					}

				},

				"tokenizer" : {

					"my_pinyin" : {

						"type" : "pinyin",

						"first_letter" : "none",

						"padding_char" : " "

					}

				}

			}

		}

	}

}' "http://localhost:9200/medcl4"

拼音效果如下

curl -X POST -d '{

  "analyzer" : "pinyin_analyzer",

  "text" : ["刘德华"]

}' "http://localhost:9200/medcl4/_analyze"

{

  "tokens": [

    {

      "token": "liu",

      "start_offset": 0,

      "end_offset": 3,

      "type": "word",

      "position": 0

    },

    {

      "token": "de",

      "start_offset": 0,

      "end_offset": 3,

      "type": "word",

      "position": 1

    },

    {

      "token": "hua",

      "start_offset": 0,

      "end_offset": 3,

      "type": "word",

      "position": 2

    }

  ]

}

padding_char 改变

first_letter=prefix padding_char=""

curl -X POST -d '{

"mappings": {

		 "folk": {

			"properties": {

			   "text": {

				  "type": "string",

				  "analyzer": "pinyin_analyzer"

			   }

			}

		 }

	  },

	"settings": {

		"index" : {

			"analysis" : {

				"analyzer" : {

					"pinyin_analyzer" : {

						"tokenizer" : "my_pinyin",

						"filter" : ["word_delimiter"]

					}

				},

				"tokenizer" : {

					"my_pinyin" : {

						"type" : "pinyin",

						"first_letter" : "prefix",

						"padding_char" : ""

					}

				}

			}

		}

	}

}' "http://localhost:9200/medcl5"

拼音效果如下

curl -X POST -d '{

  "analyzer" : "pinyin_analyzer",

  "text" : ["刘德华"]

}' "http://localhost:9200/medcl5/_analyze"

{

  "tokens": [

    {

      "token": "ldhliudehua",

      "start_offset": 0,

      "end_offset": 3,

      "type": "word",

      "position": 0

    }

  ]

}

first_letter=append padding_char=""

curl -X PUT -d '{

"mappings": {

		 "folk": {

			"properties": {

			   "text": {

				  "type": "string",

				  "analyzer": "pinyin_analyzer"

			   }

			}

		 }

	  },

	"settings": {

		"index" : {

			"analysis" : {

				"analyzer" : {

					"pinyin_analyzer" : {

						"tokenizer" : "my_pinyin",

						"filter" : ["word_delimiter"]

					}

				},

				"tokenizer" : {

					"my_pinyin" : {

						"type" : "pinyin",

						"first_letter" : "append",

						"padding_char" : ""

					}

				}

			}

		}

	}

}' "http://localhost:9200/medcl7"

拼音效果如下

curl -X POST -d '{

  "analyzer" : "pinyin_analyzer",

  "text" : ["刘德华"]

}' "http://localhost:9200/medcl7/_analyze"

{

  "tokens": [

    {

      "token": "liudehualdh",

      "start_offset": 0,

      "end_offset": 3,

      "type": "word",

      "position": 0

    }

  ]

}

first_letter=only padding_char=""

curl -X PUT -d '{

"mappings": {

		 "folk": {

			"properties": {

			   "text": {

				  "type": "string",

				  "analyzer": "pinyin_analyzer"

			   }

			}

		 }

	  },

	"settings": {

		"index" : {

			"analysis" : {

				"analyzer" : {

					"pinyin_analyzer" : {

						"tokenizer" : "my_pinyin",

						"filter" : ["word_delimiter"]

					}

				},

				"tokenizer" : {

					"my_pinyin" : {

						"type" : "pinyin",

						"first_letter" : "only",

						"padding_char" : ""

					}

				}

			}

		}

	}

}' "http://localhost:9200/medcl8"

拼音效果如下

curl -X POST -d '{

  "analyzer" : "pinyin_analyzer",

  "text" : ["刘德华"]

}' "http://localhost:9200/medcl8/_analyze"

{

  "tokens": [

    {

      "token": "ldh",

      "start_offset": 0,

      "end_offset": 3,

      "type": "word",

      "position": 0

    }

  ]

}

first_letter=none padding_char=""

curl -X PUT -d '{

"mappings": {

		 "folk": {

			"properties": {

			   "text": {

				  "type": "string",

				  "analyzer": "pinyin_analyzer"

			   }

			}

		 }

	  },

	"settings": {

		"index" : {

			"analysis" : {

				"analyzer" : {

					"pinyin_analyzer" : {

						"tokenizer" : "my_pinyin",

						"filter" : ["word_delimiter"]

					}

				},

				"tokenizer" : {

					"my_pinyin" : {

						"type" : "pinyin",

						"first_letter" : "none",

						"padding_char" : ""

					}

				}

			}

		}

	}

}' "http://localhost:9200/medcl9"

拼音效果如下

curl -X POST -d '{

  "analyzer" : "pinyin_analyzer",

  "text" : ["刘德华"]

}' "http://localhost:9200/medcl9/_analyze"

{

  "tokens": [

    {

      "token": "liudehua",

      "start_offset": 0,

      "end_offset": 3,

      "type": "word",

      "position": 0

    }

  ]

}

结论

elasticsearch 2.3.2 对应 elasticsearch-analysis-pinyin 1.7.2，pinyin 1.7.2 可配置参数有：first_letter 和 padding_char。
padding_char 的作用是将字符串按照什么字符分隔，比如 padding_char = " "，那么 刘德华 将先被分隔为 刘，德，华；如果 padding_char = ""，那么 刘德华 将不会被分隔
first_letter 取值：prefix，append，only，none。
padding_char 与 first_letter 的组合会影响拼音输出的结果

elasticsearch-analysis-pinyin 2.x 分支

github 项目 elasticsearch-analysis-pinyin 2.x 分支 是为 elasticsearch 2.x 服务，经过测试 elasticsearch 2.3.2 也可以使用该插件。

官方文档中的说明

remove_duplicated_term when this option enabled, duplicated term will be removed to save index, eg: de的>de, default: false, NOTE: position related query maybe influenced
keep_first_letter when this option enabled, eg: 刘德华>ldh, default: true
keep_separate_first_letter when this option enabled, will keep first letters separately, eg: 刘德华>l,d,h, default: false, NOTE: query result maybe too fuzziness due to term too frequency
limit_first_letter_length set max length of the first_letter result, default: 16
keep_full_pinyin when this option enabled, eg: 刘德华> [liu,de,hua], default: true
keep_joined_full_pinyin when this option enabled, eg: 刘德华> [liudehua], default: false
keep_none_chinese keep non chinese letter or number in result, default: true
keep_none_chinese_together keep non chinese letter together, default: true, eg: DJ音乐家 -> DJ,yin,yue,jia, when set to false, eg: DJ音乐家 -> D,J,yin,yue,jia, NOTE: keep_none_chinese should be enabled first
keep_none_chinese_in_first_letter keep non Chinese letters in first letter, eg: 刘德华AT2016->ldhat2016, default: true
none_chinese_pinyin_tokenize break non chinese letters into separate pinyin term if they are pinyin, default: true, eg: liudehuaalibaba13zhuanghan -> liu,de,hua,a,li,ba,ba,13,zhuang,han, NOTE: keep_none_chinese and keep_none_chinese_together should be enabled first
keep_original when this option enabled, will keep original input as well, default: false
lowercase lowercase non Chinese letters, default: true
trim_whitespace default: true

基准配置

基准配置参数

"keep_joined_full_pinyin": "false",

"lowercase": "true",

"keep_original": "false",

"keep_none_chinese_together": "true",

"remove_duplicated_term": "false",

"keep_first_letter": "true",

"keep_separate_first_letter": "false",

"trim_whitespace": "true",

"keep_none_chinese": "true",

"limit_first_letter_length": "16",

"keep_full_pinyin": "true"

创建索引与分词器

curl -X POST -d '{

	"mappings": {

		"folk": {

			"properties": {

			   "text": {

				  "type": "string",

				  "analyzer": "pinyin_analyzer"

			   }

			}

		}

	},

	"settings": {

			"index" : {

	        "analysis" : {

	            "analyzer" : {

	                "pinyin_analyzer" : {

	                    "tokenizer" : "my_pinyin"

	                    }

	            },

	            "tokenizer" : {

	                "my_pinyin" : {

	                    "type" : "pinyin",

          						"remove_duplicated_term" : false,

          						"keep_joined_full_pinyin" : false,

	                    "keep_separate_first_letter" : false,

          						"keep_first_letter" : true,

          						"limit_first_letter_length" : 16,

	                    "keep_full_pinyin" : true,

	                    "keep_original" : true,

	                    "keep_none_chinese" : true,

						          "keep_none_chinese_together" : true,

	                    "lowercase" : true,

						          "trim_whitespace" : true

	                }

	            }

	        }

	    }

	}

}' "http://localhost:9200/medcl20"

生成索引结构

curl -X GET "http://localhost:9200/medcl20"

{

  "medcl20": {

    "aliases": {},

    "mappings": {

      "folk": {

        "properties": {

          "text": {

            "type": "string",

            "analyzer": "pinyin_analyzer"

          }

        }

      }

    },

    "settings": {

      "index": {

        "creation_date": "1490170676090",

        "analysis": {

          "analyzer": {

            "pinyin_analyzer": {

              "tokenizer": "my_pinyin"

            }

          },

          "tokenizer": {

            "my_pinyin": {

              "keep_joined_full_pinyin": "false",

              "lowercase": "true",

              "keep_original": "true",

              "keep_none_chinese_together": "true",

              "remove_duplicated_term": "false",

              "keep_first_letter": "true",

              "keep_separate_first_letter": "false",

              "trim_whitespace": "true",

              "type": "pinyin",

              "keep_none_chinese": "true",

              "limit_first_letter_length": "16",

              "keep_full_pinyin": "true"

            }

          }

        },

        "number_of_shards": "5",

        "number_of_replicas": "1",

        "uuid": "31Y9PizQQ2KQn_Fl6bpPNw",

        "version": {

          "created": "2030299"

        }

      }

    },

    "warmers": {}

  }

}

分词器分词效果

curl -X POST -d '{

  "analyzer" : "pinyin_analyzer",

  "text" : ["刘德华"]

}' "http://localhost:9200/medcl20/_analyze"

{

  "tokens": [

    {

      "token": "liu",

      "start_offset": 0,

      "end_offset": 1,

      "type": "word",

      "position": 0

    },

    {

      "token": "de",

      "start_offset": 1,

      "end_offset": 2,

      "type": "word",

      "position": 1

    },

    {

      "token": "hua",

      "start_offset": 2,

      "end_offset": 3,

      "type": "word",

      "position": 2

    },

    {

      "token": "刘德华",

      "start_offset": 0,

      "end_offset": 3,

      "type": "word",

      "position": 3

    },

    {

      "token": "ldh",

      "start_offset": 0,

      "end_offset": 3,

      "type": "word",

      "position": 4

    }

  ]

}

keep_original

keep_original = true

curl -X POST -d '{

  "analyzer" : "pinyin_analyzer",

  "text" : ["刘德华"]

}' "http://localhost:9200/medcl20/_analyze"

{

  "tokens": [

    {

      "token": "liu",

      "start_offset": 0,

      "end_offset": 1,

      "type": "word",

      "position": 0

    },

    {

      "token": "de",

      "start_offset": 1,

      "end_offset": 2,

      "type": "word",

      "position": 1

    },

    {

      "token": "hua",

      "start_offset": 2,

      "end_offset": 3,

      "type": "word",

      "position": 2

    },

    {

      "token": "刘德华",

      "start_offset": 0,

      "end_offset": 3,

      "type": "word",

      "position": 3

    },

    {

      "token": "ldh",

      "start_offset": 0,

      "end_offset": 3,

      "type": "word",

      "position": 4

    }

  ]

}

keep_original = false

curl -X POST -d '{

  "analyzer" : "pinyin_analyzer",

  "text" : ["刘德华"]

}' "http://localhost:9200/medcl20/_analyze"

{

  "tokens": [

    {

      "token": "liu",

      "start_offset": 0,

      "end_offset": 1,

      "type": "word",

      "position": 0

    },

    {

      "token": "de",

      "start_offset": 1,

      "end_offset": 2,

      "type": "word",

      "position": 1

    },

    {

      "token": "hua",

      "start_offset": 2,

      "end_offset": 3,

      "type": "word",

      "position": 2

    },

    {

      "token": "ldh",

      "start_offset": 0,

      "end_offset": 3,

      "type": "word",

      "position": 3

    }

  ]

}

keep_original 功能

keep_original=true 将保留原字符串，比如存入索引的数据为 刘德华 那么 刘德华 将也会被保存到索引中。keep_original=false 则不保存原字符串到索引

trim_whitespace

trim_whitespace=true

curl -X POST -d '{

  "analyzer" : "pinyin_analyzer",

  "text" : ["   最爱   刘德华   的帅气帅气的   "]

}' "http://localhost:9200/medcl20/_analyze"

{

  "tokens": [

    {

      "token": "zui",

      "start_offset": 3,

      "end_offset": 4,

      "type": "word",

      "position": 0

    },

    {

      "token": "ai",

      "start_offset": 4,

      "end_offset": 5,

      "type": "word",

      "position": 1

    },

    {

      "token": "liu",

      "start_offset": 8,

      "end_offset": 9,

      "type": "word",

      "position": 2

    },

    {

      "token": "de",

      "start_offset": 9,

      "end_offset": 10,

      "type": "word",

      "position": 3

    },

    {

      "token": "hua",

      "start_offset": 10,

      "end_offset": 11,

      "type": "word",

      "position": 4

    },

    {

      "token": "de",

      "start_offset": 14,

      "end_offset": 15,

      "type": "word",

      "position": 5

    },

    {

      "token": "shuai",

      "start_offset": 15,

      "end_offset": 16,

      "type": "word",

      "position": 6

    },

    {

      "token": "qi",

      "start_offset": 16,

      "end_offset": 17,

      "type": "word",

      "position": 7

    },

    {

      "token": "shuai",

      "start_offset": 17,

      "end_offset": 18,

      "type": "word",

      "position": 8

    },

    {

      "token": "qi",

      "start_offset": 18,

      "end_offset": 19,

      "type": "word",

      "position": 9

    },

    {

      "token": "de",

      "start_offset": 19,

      "end_offset": 20,

      "type": "word",

      "position": 10

    },

    {

      "token": "最爱   刘德华   的帅气帅气的",

      "start_offset": 0,

      "end_offset": 23,

      "type": "word",

      "position": 11

    },

    {

      "token": "zaldhdsqsqd",

      "start_offset": 0,

      "end_offset": 11,

      "type": "word",

      "position": 12

    }

  ]

}

trim_whitespace=false

curl -X POST -d '{

  "analyzer" : "pinyin_analyzer",

  "text" : ["   最爱   刘德华   的帅气帅气的   "]

}' "http://localhost:9200/medcl20/_analyze"

{

  "tokens": [

    {

      "token": "zui",

      "start_offset": 3,

      "end_offset": 4,

      "type": "word",

      "position": 0

    },

    {

      "token": "ai",

      "start_offset": 4,

      "end_offset": 5,

      "type": "word",

      "position": 1

    },

    {

      "token": "liu",

      "start_offset": 8,

      "end_offset": 9,

      "type": "word",

      "position": 2

    },

    {

      "token": "de",

      "start_offset": 9,

      "end_offset": 10,

      "type": "word",

      "position": 3

    },

    {

      "token": "hua",

      "start_offset": 10,

      "end_offset": 11,

      "type": "word",

      "position": 4

    },

    {

      "token": "de",

      "start_offset": 14,

      "end_offset": 15,

      "type": "word",

      "position": 5

    },

    {

      "token": "shuai",

      "start_offset": 15,

      "end_offset": 16,

      "type": "word",

      "position": 6

    },

    {

      "token": "qi",

      "start_offset": 16,

      "end_offset": 17,

      "type": "word",

      "position": 7

    },

    {

      "token": "shuai",

      "start_offset": 17,

      "end_offset": 18,

      "type": "word",

      "position": 8

    },

    {

      "token": "qi",

      "start_offset": 18,

      "end_offset": 19,

      "type": "word",

      "position": 9

    },

    {

      "token": "de",

      "start_offset": 19,

      "end_offset": 20,

      "type": "word",

      "position": 10

    },

    {

      "token": "   最爱   刘德华   的帅气帅气的   ",

      "start_offset": 0,

      "end_offset": 23,

      "type": "word",

      "position": 11

    },

    {

      "token": "zaldhdsqsqd",

      "start_offset": 0,

      "end_offset": 11,

      "type": "word",

      "position": 12

    }

  ]

}

trim_whitespace 功能

去除字符串首尾空格字符，不去除字符串中间的空格。这个参数只有当 keep_original=true 时才能够看到效果。例如当字符串为 "   最爱   刘德华   的帅气帅气的   "（首尾各有 3 个空格）时，trim_whitespace=true 则原字符串将被保存为 "最爱   刘德华   的帅气帅气的"；如果 trim_whitespace=false 则原字符串将被保存为 "   最爱   刘德华   的帅气帅气的   "。如果 keep_original=false，那么原字符串没有被保存，也将看不到效果。

keep_joined_full_pinyin

keep_joined_full_pinyin = false

curl -X POST -d '{

  "analyzer" : "pinyin_analyzer",

  "text" : ["刘德华"]

}' "http://localhost:9200/medcl21/_analyze"

{

  "tokens": [

    {

      "token": "liu",

      "start_offset": 0,

      "end_offset": 1,

      "type": "word",

      "position": 0

    },

    {

      "token": "de",

      "start_offset": 1,

      "end_offset": 2,

      "type": "word",

      "position": 1

    },

    {

      "token": "hua",

      "start_offset": 2,

      "end_offset": 3,

      "type": "word",

      "position": 2

    },

    {

      "token": "刘德华",

      "start_offset": 0,

      "end_offset": 3,

      "type": "word",

      "position": 3

    },

    {

      "token": "ldh",

      "start_offset": 0,

      "end_offset": 3,

      "type": "word",

      "position": 4

    }

  ]

}

keep_joined_full_pinyin = true

curl -X POST -d '{

  "analyzer" : "pinyin_analyzer",

  "text" : ["刘德华"]

}' "http://localhost:9200/medcl22/_analyze"

{

  "tokens": [

    {

      "token": "liu",

      "start_offset": 0,

      "end_offset": 1,

      "type": "word",

      "position": 0

    },

    {

      "token": "de",

      "start_offset": 1,

      "end_offset": 2,

      "type": "word",

      "position": 1

    },

    {

      "token": "hua",

      "start_offset": 2,

      "end_offset": 3,

      "type": "word",

      "position": 2

    },

    {

      "token": "刘德华",

      "start_offset": 0,

      "end_offset": 3,

      "type": "word",

      "position": 3

    },

    {

      "token": "liudehua",

      "start_offset": 0,

      "end_offset": 8,

      "type": "word",

      "position": 4

    },

    {

      "token": "ldh",

      "start_offset": 0,

      "end_offset": 3,

      "type": "word",

      "position": 5

    }

  ]

}

keep_joined_full_pinyin 功能

keep_joined_full_pinyin=true 将保存字符串拼音全拼，false 则不保存。例如，当 keep_joined_full_pinyin=true 时，文本 刘德华 的拼音全拼 liudehua 将会被保留；当 keep_joined_full_pinyin=false 时，全拼 liudehua 不会被保留

remove_duplicated_term

remove_duplicated_term = false

curl -X POST -d '{

  "analyzer" : "pinyin_analyzer",

  "text" : ["刘德华刘德华帅帅帅，帅帅帅"]

}' "http://localhost:9200/medcl20/_analyze"

{

  "tokens": [

    {

      "token": "liu",

      "start_offset": 0,

      "end_offset": 1,

      "type": "word",

      "position": 0

    },

    {

      "token": "de",

      "start_offset": 1,

      "end_offset": 2,

      "type": "word",

      "position": 1

    },

    {

      "token": "hua",

      "start_offset": 2,

      "end_offset": 3,

      "type": "word",

      "position": 2

    },

    {

      "token": "liu",

      "start_offset": 3,

      "end_offset": 4,

      "type": "word",

      "position": 3

    },

    {

      "token": "de",

      "start_offset": 4,

      "end_offset": 5,

      "type": "word",

      "position": 4

    },

    {

      "token": "hua",

      "start_offset": 5,

      "end_offset": 6,

      "type": "word",

      "position": 5

    },

    {

      "token": "shuai",

      "start_offset": 6,

      "end_offset": 7,

      "type": "word",

      "position": 6

    },

    {

      "token": "shuai",

      "start_offset": 7,

      "end_offset": 8,

      "type": "word",

      "position": 7

    },

    {

      "token": "shuai",

      "start_offset": 8,

      "end_offset": 9,

      "type": "word",

      "position": 8

    },

    {

      "token": "shuai",

      "start_offset": 10,

      "end_offset": 11,

      "type": "word",

      "position": 9

    },

    {

      "token": "shuai",

      "start_offset": 11,

      "end_offset": 12,

      "type": "word",

      "position": 10

    },

    {

      "token": "shuai",

      "start_offset": 12,

      "end_offset": 13,

      "type": "word",

      "position": 11

    },

    {

      "token": "刘德华刘德华帅帅帅，帅帅帅",

      "start_offset": 0,

      "end_offset": 13,

      "type": "word",

      "position": 12

    },

    {

      "token": "ldhldhssssss",

      "start_offset": 0,

      "end_offset": 12,

      "type": "word",

      "position": 13

    }

  ]

}

remove_duplicated_term = true

curl -X POST -d '{

  "analyzer" : "pinyin_analyzer",

  "text" : ["刘德华刘德华帅帅帅，帅帅帅"]

}' "http://localhost:9200/medcl26/_analyze"

{

  "tokens": [

    {

      "token": "liu",

      "start_offset": 0,

      "end_offset": 1,

      "type": "word",

      "position": 0

    },

    {

      "token": "de",

      "start_offset": 1,

      "end_offset": 2,

      "type": "word",

      "position": 1

    },

    {

      "token": "hua",

      "start_offset": 2,

      "end_offset": 3,

      "type": "word",

      "position": 2

    },

    {

      "token": "shuai",

      "start_offset": 6,

      "end_offset": 7,

      "type": "word",

      "position": 3

    },

    {

      "token": "刘德华刘德华帅帅帅，帅帅帅",

      "start_offset": 0,

      "end_offset": 13,

      "type": "word",

      "position": 4

    },

    {

      "token": "ldhldhssssss",

      "start_offset": 0,

      "end_offset": 12,

      "type": "word",

      "position": 5

    }

  ]

}

remove_duplicated_term 功能

remove_duplicated_term=true 则会将文本中相同的拼音只保存一份，比如 刘德华刘德华 只会保留一份拼音 liu，de，hua；相对的 remove_duplicated_term=false 则会保留两份 liu，de，hua。注意：remove_duplicated_term 并不会影响文本的首字母拼音，刘德华刘德华 生成的首字母拼音始终都为 ldhldh。

remove_duplicated_term = true 并且 keep_joined_full_pinyin = true

curl -X POST -d '{

  "analyzer" : "pinyin_analyzer",

  "text" : ["刘德华刘德华帅帅帅，帅帅帅"]

}' "http://localhost:9200/medcl27/_analyze"

{

  "tokens": [

    {

      "token": "liu",

      "start_offset": 0,

      "end_offset": 1,

      "type": "word",

      "position": 0

    },

    {

      "token": "de",

      "start_offset": 1,

      "end_offset": 2,

      "type": "word",

      "position": 1

    },

    {

      "token": "hua",

      "start_offset": 2,

      "end_offset": 3,

      "type": "word",

      "position": 2

    },

    {

      "token": "shuai",

      "start_offset": 6,

      "end_offset": 7,

      "type": "word",

      "position": 3

    },

    {

      "token": "刘德华刘德华帅帅帅，帅帅帅",

      "start_offset": 0,

      "end_offset": 13,

      "type": "word",

      "position": 4

    },

    {

      "token": "liudehualiudehuashuaishuaishuaishuaishuaishuai",

      "start_offset": 0,

      "end_offset": 46,

      "type": "word",

      "position": 5

    },

    {

      "token": "ldhldhssssss",

      "start_offset": 0,

      "end_offset": 12,

      "type": "word",

      "position": 6

    }

  ]

}

remove_duplicated_term 功能

remove_duplicated_term = true 会过滤重复的单字拼音，但是不影响拼接后的全拼。例如，刘德华刘德华 生成的字符串全拼仍为 liudehualiudehua。

keep_none_chinese

keep_none_chinese = true

POST /medcl20/_analyze HTTP/1.1

Host: localhost:9200

{

  "analyzer" : "pinyin_analyzer",

  "text" : ["刘*20*德b华DJ"]

}

{

  "tokens": [

    {

      "token": "liu",

      "start_offset": 0,

      "end_offset": 1,

      "type": "word",

      "position": 0

    },

    {

      "token": "20",

      "start_offset": 3,

      "end_offset": 5,

      "type": "word",

      "position": 1

    },

    {

      "token": "de",

      "start_offset": 5,

      "end_offset": 6,

      "type": "word",

      "position": 2

    },

    {

      "token": "b",

      "start_offset": 6,

      "end_offset": 7,

      "type": "word",

      "position": 3

    },

    {

      "token": "hua",

      "start_offset": 7,

      "end_offset": 8,

      "type": "word",

      "position": 4

    },

    {

      "token": "d",

      "start_offset": 7,

      "end_offset": 9,

      "type": "word",

      "position": 5

    },

    {

      "token": "j",

      "start_offset": 7,

      "end_offset": 9,

      "type": "word",

      "position": 6

    },

    {

      "token": "刘*20*德b华dj",

      "start_offset": 0,

      "end_offset": 10,

      "type": "word",

      "position": 7

    },

    {

      "token": "l20dbhdj",

      "start_offset": 0,

      "end_offset": 8,

      "type": "word",

      "position": 8

    }

  ]

}

keep_none_chinese = false

POST /medcl28/_analyze HTTP/1.1

Host: localhost:9200

{

  "analyzer" : "pinyin_analyzer",

  "text" : ["刘*20*德b华DJ"]

}

{

  "tokens": [

    {

      "token": "liu",

      "start_offset": 0,

      "end_offset": 1,

      "type": "word",

      "position": 0

    },

    {

      "token": "de",

      "start_offset": 5,

      "end_offset": 6,

      "type": "word",

      "position": 1

    },

    {

      "token": "hua",

      "start_offset": 7,

      "end_offset": 8,

      "type": "word",

      "position": 2

    },

    {

      "token": "刘*20*德b华dj",

      "start_offset": 0,

      "end_offset": 10,

      "type": "word",

      "position": 3

    },

    {

      "token": "l20dbhdj",

      "start_offset": 0,

      "end_offset": 8,

      "type": "word",

      "position": 4

    }

  ]

}

keep_none_chinese 功能

keep_none_chinese = true 时，非中文的字母以及数字将会被保留，但需要注意，所有特殊字符都不会被保留。例如，文本 刘*20*德b华DJ 中的数字 20，字母 b 与 dj 将会被保留，而特殊字符 * 不会被保留；keep_none_chinese = false 时，非中文的字母以及数字不会被保留，上述文本中的数字 20，字母 b 与 dj 都不会被保留。注意：参数 keep_none_chinese 不会影响首字母拼音以及所有字符组成的全拼，上述文本生成的首字母拼音为 l20dbhdj，所有字符组成的全拼为 liu20debhuadj，特殊字符始终会被过滤去除。

欢迎转载，请注明本文链接，谢谢你。

2017.4.12 20:44

elasticsearch 拼音检索能力研究的更多相关文章

Elasticsearch原理学习--为什么Elasticsearch/Lucene检索可以比MySQL快?
转载于:http://vlambda.com/wz_wvS2uI5VRn.html 同样都可以对数据构建索引并通过索引查询数据,为什么Lucene或基于Lucene的Elasticsearch会比关系 ...
ElasticSearch进阶检索
ElasticSearch进阶检索入门检索中讲了如何导入elastic提供的样本测试数据,下面我们用这些数据进一步检索一.SearchAPI ES 支持两种基本方式检索 : 1.一种是通过使用 R ...
MD5、拼音检索和邮件发送
MD5算法 MD5算法是一种散列(hash)算法(摘要算法,指纹算法),不是一种加密算法(易错) l 为了防止用户偷懒,算两次MD5值,或者加上一个固定的字符串 MD5算法理论上是不可逆的,因此攻击 ...
.NET 拼音检索
微软提供了一个Visual Studio International Pack 组件,可以转换简繁体,或者将汉字转换为拼音以及其他语言的支持. https://www.microsoft.com/zh ...
easyui combobox 拼音检索快捷选择输入
easyui combobox 拼音检索快捷选择输入效果如图 $.ajax({ url: UserActionUrl + '?action=listuserworktype', dataType ...
搭建ElasticSearch+MongoDB检索系统
ElasticSearch是一个基于Lucene的搜索服务器.它提供了一个分布式多用户能力的全文搜索引擎,基于RESTful web接口.Elasticsearch是用Java开发的,并作为Apach ...
分布式搜索elasticsearch 文献检索索引入门
1.首先,例如,下面的数据被提交给ES该指数 {"number":32768,"singer":"杨坤","size": ...
ES 19 - Elasticsearch的检索语法(_search API的使用)
目录 1 Search API的基本用法 1.1 查询所有数据 1.2 响应信息说明 1.3 timeout超时机制 1.4 查询多索引和多类型中的数据 2 URI Search的用法 2.1 GET ...
elasticsearch 拼音+ik分词，spring data elasticsearch 拼音分词
elasticsearch 自定义分词器安装拼音分词器.ik分词器拼音分词器: https://github.com/medcl/elasticsearch-analysis-pinyin/rel ...

随机推荐

【题解】JSOIWC2019 Round1
题面(T1变成5s(毒瘤出题人发现std超时了qaq)): 啥都不会qaq.但也送了不少分题解: T1: 当T=0时直接异或前缀和,但T=1时就有点恶心暴力能有80pts(防止大家爆零) 还珂以用 ...
FastClick用法
https://majing.io/posts/10000007721218 为什么要使用FastClick 移动设备上的浏览器默认会在用户点击屏幕大约延迟300毫秒后才会触发点击事件,这是为了检查用 ...
使用docker部署ambari的若干要点
ambari部署各个组件使用ambari进行部署时主要需要的组件包括: ambari-server: 主要部署的控制节点,负责控制agent进行部署. mysql: server存储的数据库.也支持 ...
Loj 6036 「雅礼集训 2017 Day4」编码 - 2-sat
题目传送门唯一的传送门题目大意给定$n$个串,每个串只包含 ' .问是否可能任意两个不同的串不满足一个是另一个的前缀. 2-sat的是显然的. 枚举每个通配符填0还是1,然后插入Trie树. 对 ...
HDU 6166 Senior Pan（k点中最小两点间距离）题解
题意:n个点,m条有向边,指定k个点,问你其中最近的两点距离为多少思路:这题的思路很巧妙,如果我们直接枚举两点做最短路那就要做C(k,2)次.但是我们换个思路,我们把k个点按照二进制每一位的0和1分 ...
C# 线程正确使用Thread.Join()停止方式
/// <summary> /// 停下线程 /// </summary> private void MyStopTask() ...
Java this关键字学习笔记
前言: 这篇博文就是系统的学习一下Java中的this关键字,本人对this关键字的理解知识简单的停留在对类的成员变量进行赋值,这次所以决定系统的体会一下this 关键字转自:https://b ...
vivado 创建PS工程
前言本文简要介绍在vivado中创建PS工程.单纯使用zynq芯片的PS部分就像使用普通ARM芯片一样,只是多了建立Zynq硬件系统这一个步骤.vivado创建PL工程参见此处新建工程与viva ...
CookieHelper
using System.Web: /// <summary> /// CookieHelper /// </summary> public static class Cook ...
Linux上发布.Net Core
环境准备下面我们使用VM虚拟机.我这里安装的Linux系统是centos7 软件提供: VM: https://www.vmware.com/cn.html centos7 Minimal :htt ...

elasticsearch 拼音检索能力研究

gitchennan/elasticsearch-analysis-lc-pinyin

对应版本

创建一个类型

插件自带分词器 lc_index

插件自带分词器 lc_search

拼音全拼

单字拼音全拼与中文混合

单字拼音首字母与中文混合

拼音首字母

结论

elasticsearch-analysis-pinyin v1.7.2

first_letter 改变

first_letter=prefix padding_char=" "

first_letter=append padding_char=" "

first_letter=only padding_char=" "

first_letter=none padding_char=" "

padding_char 改变

first_letter=prefix padding_char=""

first_letter=append padding_char=""

first_letter=only padding_char=""

first_letter=none padding_char=""

结论

elasticsearch-analysis-pinyin 2.x 分支

官方文档中的说明

基准配置

keep_original

keep_original = true

keep_original = false

keep_original 功能

trim_whitespace

trim_whitespace=true

trim_whitespace=false

trim_whitespace 功能

keep_joined_full_pinyin

keep_joined_full_pinyin = false

keep_joined_full_pinyin = true

keep_joined_full_pinyin 功能

remove_duplicated_term

remove_duplicated_term = false

remove_duplicated_term = true

remove_duplicated_term 功能

remove_duplicated_term = true 并且 keep_joined_full_pinyin = true

remove_duplicated_term 功能

keep_none_chinese

keep_none_chinese = true

keep_none_chinese = false

keep_none_chinese 功能

elasticsearch 拼音检索能力研究的更多相关文章

随机推荐

热门专题