elasticsearch支持大table格式数据的搜索

一、问题源起

数据情况

TableMeta，保存table的元数据，通过fileId关联具体的GridFS文件；

id	name	creator	fileId
1	table1	mango	f1
2	table2	mango	f2

table内包含列名和具体的行数据；

不同类型的table，列的名字和数量都可能不同；

from fport to toport location

192.168.1.1 11 192.168.1.12 11 chaoyang

192.168.1.2 22 192.168.1.13 22 tongzhou

搜索要求

支持所有类型的table的搜索；

支持全字段的搜索；

只返回表内命中的行，并进行高亮；

二、开发环境

elasticsearch 6.8.12

java 12.0.2 2019-07-16

Java(TM) SE Runtime Environment (build 12.0.2+10)

Java HotSpot(TM) 64-Bit Server VM (build 12.0.2+10, mixed mode, sharing)

三、elastic search对array的支持情况

扁平化数组元素

默认情况下elastic search会将数组内部对象的字段进行扁平化处理，这样就会丢失掉元素的独立性。

直接index一个文档

PUT my_array_index/_doc/1

{

  "group" : "fans",

  "user" : [

    {

      "first" : "John",

      "last" :  "Smith"

    },

    {

      "first" : "Alice",

      "last" :  "White"

    }

  ]

}

{

    "_index":"my_array_index",

    "_type":"_doc",

    "_id":"1",

    "_version":1,

    "result":"created",

    "_shards":{

        "total":2,

        "successful":1,

        "failed":0

    },

    "_seq_no":0,

    "_primary_term":1

}

elastic search 内部会将文档转化为如下形式再进行索引

{

  "group" :        "fans",

  "user.first" : [ "alice", "john" ],

  "user.last" :  [ "smith", "white" ]

}

扁平化处理将所有数组元素对象的相同字段值合并到一起作为一个数组，这样就丢失了user.first和user.last之间的对应关系，类似下边的查询即使没有Alice Smith这个人也可以命中

GET my_array_index/_search

{

  "query": {

    "bool": {

      "must": [

        { "match": { "user.first": "Alice" }},

        { "match": { "user.last":  "Smith" }}

      ]

    }

  }

}

{

    "took":2,

    "timed_out":false,

    "_shards":{

        "total":5,

        "successful":5,

        "skipped":0,

        "failed":0

    },

    "hits":{

        "total":1,

        "max_score":0.5753642,

        "hits":[

            {

                "_index":"my_array_index",

                "_type":"_doc",

                "_id":"1",

                "_score":0.5753642,

                "_source":{

                    "group":"fans",

                    "user":[

                        {

                            "first":"John",

                            "last":"Smith"

                        },

                        {

                            "first":"Alice",

                            "last":"White"

                        }

                    ]

                }

            }

        ]

    }

}

使用nested数据类型文档化数组元素

elastic search内部提供了nested数据类型，可以将数组元素作为单独的隐藏的内部文档进行索引，从而保持文档之间的独立性；

将字段映射为nested类型

PUT my_nested_index

{

  "mappings": {

    "_doc": {

      "properties": {

        "user": {

          "type": "nested"

        }

      }

    }

  }

}

{

    "acknowledged":true,

    "shards_acknowledged":true,

    "index":"my_nested_index"

}

index文档

PUT my_nested_index/_doc/1

{

  "group" : "fans",

  "user" : [

    {

      "first" : "John",

      "last" :  "Smith"

    },

    {

      "first" : "Alice",

      "last" :  "White"

    }

  ]

}

{

    "_index":"my_nested_index",

    "_type":"_doc",

    "_id":"1",

    "_version":1,

    "result":"created",

    "_shards":{

        "total":2,

        "successful":1,

        "failed":0

    },

    "_seq_no":0,

    "_primary_term":1

}

elastic search提供了单独的nested query 来支持nested类型

GET my_nested_index/_search

{

  "query": {

    "nested": {

      "path": "user",

      "query": {

        "bool": {

          "must": [

            { "match": { "user.first": "Alice" }},

            { "match": { "user.last":  "Smith" }}

          ]

        }

      }

    }

  }

}

{

    "took":3,

    "timed_out":false,

    "_shards":{

        "total":5,

        "successful":5,

        "skipped":0,

        "failed":0

    },

    "hits":{

        "total":0,

        "max_score":null,

        "hits":[

        ]

    }

}

nested query提供了inner_hits来支持返回命中的内部文档并对字段进行高亮；从返回结果中可以看到，_nested.offset字段指出了命中的是数组中的第几个元素（从0开始计数）；

GET my_nested_index/_search

{

  "query": {

    "nested": {

      "path": "user",

      "query": {

        "bool": {

          "should": [

            { "match": { "user.first": "Alice" }},

            { "match": { "user.last":  "smith" }}

          ]

        }

      },

      "inner_hits": {

        "highlight": {

          "fields": {

            "*": {}

          }

        }

      }

    }

  }

}

{

    "took":8,

    "timed_out":false,

    "_shards":{

        "total":5,

        "successful":5,

        "skipped":0,

        "failed":0

    },

    "hits":{

        "total":1,

        "max_score":0.6931472,

        "hits":[

            {

                "_index":"my_nested_index",

                "_type":"_doc",

                "_id":"1",

                "_score":0.6931472,

                "_source":{

                    "group":"fans",

                    "user":[

                        {

                            "first":"John",

                            "last":"Smith"

                        },

                        {

                            "first":"Alice",

                            "last":"White"

                        }

                    ]

                },

                "inner_hits":{

                    "user":{

                        "hits":{

                            "total":2,

                            "max_score":0.6931472,

                            "hits":[

                                {

                                    "_index":"my_nested_index",

                                    "_type":"_doc",

                                    "_id":"1",

                                    "_nested":{

                                        "field":"user",

                                        "offset":0

                                    },

                                    "_score":0.6931472,

                                    "_source":{

                                        "first":"John",

                                        "last":"Smith"

                                    },

                                    "highlight":{

                                        "user.last":[

                                            "<em>Smith</em>"

                                        ]

                                    }

                                },

                                {

                                    "_index":"my_nested_index",

                                    "_type":"_doc",

                                    "_id":"1",

                                    "_nested":{

                                        "field":"user",

                                        "offset":1

                                    },

                                    "_score":0.6931472,

                                    "_source":{

                                        "first":"Alice",

                                        "last":"White"

                                    },

                                    "highlight":{

                                        "user.first":[

                                            "<em>Alice</em>"

                                        ]

                                    }

                                }

                            ]

                        }

                    }

                }

            }

        ]

    }

}

总结

经过以上的研究可以看到，elastic search提供的nested数据类型基本满足我们的目标要求，接下来使用具体的table数据做进一步的研究；

四、使用nested数据类型索引Table数据

elastic search索引数据结构

字段名字	字段类型	描述
id	keyword	主键
name	keyword	table的名字
creator	keyword	创建者
content	nested（object数组）	行数据数组，每个元素是一行，key为列序号，value为单元格内容

elastic search mapping

PUT tables

{

  "mappings": {

    "_doc": {

      "properties": {

        "id": {

          "type": "keyword"

        },

        "name": {

          "type": "keyword"

        },

        "creator": {

          "type": "keyword"

        },

        "content": {

          "type": "nested"

        }

      }

    }

  }

}

{

    "acknowledged": true,

    "shards_acknowledged": true,

    "index": "tables"

}

index 一个Table data

PUT tables/_doc/1

{

    "id":"1",

    "name":"table1",

    "creator":"mango",

    "content":[

        {

            "0":"192.168.1.1",

            "1":"11",

            "2":"192.168.1.12",

            "3":"11",

            "4":"chaoyang"

        },

        {

            "0":"192.168.1.2",

            "1":"22",

            "2":"192.168.1.13",

            "3":"22",

            "4":"tongzhou"

        },

        {

            "0":"192.168.1.3",

            "1":"33",

            "2":"192.168.1.14",

            "3":"33",

            "4":"daxing"

        }

    ]

}

{

    "_index":"tables",

    "_type":"_doc",

    "_id":"1",

    "_version":1,

    "result":"created",

    "_shards":{

        "total":2,

        "successful":1,

        "failed":0

    },

    "_seq_no":0,

    "_primary_term":1

}

search Table data

搜索所有列

限制只返回Table的元数据信息

限制只返回命中行的信息

返回命中行的高亮信息

POST tables/_search

{

    "from":0,

    "size":20,

    "_source":{

        "excludes":[

            "content"

        ]

    },

    "query":{

        "nested":{

            "path":"content",

            "query":{

                "query_string":{

                    "fields":[

                        "content.*"

                    ],

                    "query":"tongzhou  192.168.1.1"

                }

            },

            "inner_hits":{

                "from":0,

                "size":2,

                "highlight":{

                    "fields":{

                        "*":{

                        }

                    }

                }

            }

        }

    }

}

{

    "took":19,

    "timed_out":false,

    "_shards":{

        "total":5,

        "successful":5,

        "skipped":0,

        "failed":0

    },

    "hits":{

        "total":1,

        "max_score":0.9808292,

        "hits":[

            {

                "_index":"tables",

                "_type":"_doc",

                "_id":"1",

                "_score":0.9808292,

                "_source":{

                    "creator":"mango",

                    "name":"table1",

                    "id":"1"

                },

                "inner_hits":{

                    "content":{

                        "hits":{

                            "total":2,

                            "max_score":0.9808292,

                            "hits":[

                                {

                                    "_index":"tables",

                                    "_type":"_doc",

                                    "_id":"1",

                                    "_nested":{

                                        "field":"content",

                                        "offset":0

                                    },

                                    "_score":0.9808292,

                                    "_source":{

                                        "0":"192.168.1.1",

                                        "1":"11",

                                        "2":"192.168.1.12",

                                        "3":"11",

                                        "4":"chaoyang"

                                    },

                                    "highlight":{

                                        "content.0":[

                                            "<em>192.168.1.1</em>"

                                        ]

                                    }

                                },

                                {

                                    "_index":"tables",

                                    "_type":"_doc",

                                    "_id":"1",

                                    "_nested":{

                                        "field":"content",

                                        "offset":1

                                    },

                                    "_score":0.9808292,

                                    "_source":{

                                        "0":"192.168.1.2",

                                        "1":"22",

                                        "2":"192.168.1.13",

                                        "3":"22",

                                        "4":"tongzhou"

                                    },

                                    "highlight":{

                                        "content.4":[

                                            "<em>tongzhou</em>"

                                        ]

                                    }

                                }

                            ]

                        }

                    }

                }

            }

        ]

    }

}