ElasticSearch学习笔记一


ElasticSearch学习笔记一


环境搭建

elasticsearch 环境搭建可参考

ElasticSearch版本基于7+

后续所有操作均基于kibana的dev tool执行。

模拟帖子数据

_bulk

<code>POST _bulk
{ "index": { "_index":"forum" ,"_id": 1 }}
{ "articleID" : "XHDK-A-1293-#fJ3", "userID" : 1, "hidden": false, "postDate": "2017-01-01" }
{ "index": {"_index":"forum", "_id": 2 }}
{ "articleID" : "KDKE-B-9947-#kL5", "userID" : 1, "hidden": false, "postDate": "2017-01-02" }
{ "index": {"_index":"forum", "_id": 3 }}
{ "articleID" : "JODL-X-1937-#pV7", "userID" : 2, "hidden": false, "postDate": "2017-01-01" }
{ "index": { "_index":"forum" ,"_id": 4 }}
{ "articleID" : "QQPX-R-3956-#aD8", "userID" : 2, "hidden": true, "postDate": "2017-01-02" }

</code>

查看结构

<code>GET /forum/_mapping

{
"forum" : {
"mappings" : {
"properties" : {
"articleID" : {
"type" : "text",
"fields" : {
"keyword" : {
"type" : "keyword",
"ignore_above" : 256
}
}
},
"hidden" : {

"type" : "boolean"
},
"postDate" : {
"type" : "date"
},
"userID" : {
"type" : "long"
}
}
}
}
}


</code>

按ID检索信息

<code>GET /forum/_search
{
"query": {
"constant_score": {
"filter": {
"term": {
"userID": 1
}
}

}
}
}

# 展示信息

{
"took" : 4,
"timed_out" : false,
"_shards" : {
"total" : 1,
"successful" : 1,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : {
"value" : 2,
"relation" : "eq"
},
"max_score" : 1.0,
"hits" : [

{
"_index" : "forum",
"_type" : "_doc",
"_id" : "1",
"_score" : 1.0,
"_source" : {
"articleID" : "XHDK-A-1293-#fJ3",
"userID" : 1,
"hidden" : false,
"postDate" : "2017-01-01"
}
},
{
"_index" : "forum",
"_type" : "_doc",
"_id" : "2",
"_score" : 1.0,
"_source" : {
"articleID" : "KDKE-B-9947-#kL5",
"userID" : 1,
"hidden" : false,
"postDate" : "2017-01-02"
}
}
]
}
}

# 查看未隐藏的信息(hidden为false)
GET /forum/_search
{
"query" : {
"constant_score" : {
"filter" : {
"term" : {
"hidden" : false
}
}
}
}
}


# 按articleID检索信息(articleID是会被分词的text类型,term查询不对搜索文本分词,所以直接用articleID搜不到结果,需要用articleID.keyword)
GET /forum/_search
{
"query" : {
"constant_score" : {
"filter" : {
"term" : {

"articleID" : "XHDK-A-1293-#fJ3"
}
}
}
}
}
GET /forum/_search
{
"query" : {
"constant_score" : {
"filter" : {
"term" : {
"articleID.keyword" : "XHDK-A-1293-#fJ3"
}
}
}
}
}

# articleID.keyword,是es最新版本内置建立的field,就是不分词的。
# 所以一个articleID过来的时候,会建立两次索引,一次是自己本身,是要分词的,分词后放入倒排索引;
# 另外一次是基于articleID.keyword,不分词,保留256个字符最多,直接一个字符串放入倒排索引中。
</code>

查看分词

默认是analyzed的text类型的field,建立倒排索引的时候,就会对所有的articleID分词,分词以后,原本的articleID就没有了,只有分词后的各个word存在于倒排索引中。 term,是不对搜索文本分词的,XHDK-A-1293-#fJ3 –> XHDK-A-1293-#fJ3;但是articleID建立索引的时候,XHDK-A-1293-#fJ3 –> xhdk,a,1293,fj3

<code>GET /forum/_analyze
{
"field": "articleID",
"text": "XHDK-A-1293-#fJ3"

}
</code>

重建索引

<code>DELETE /forum

PUT /forum
{
"mappings": {
"properties": {
"articleID": {
"type": "keyword"
}

}
}
}
</code>

多条件组合查询

  1. 搜索日期为2017-01-01或ID为XHDK-A-1293-#fJ3,并且日期不为2017-01-02的信息: GET /forum/_search { "query": { "constant_score": { "filter": { "bool": { "should": [ {"term": { "postDate": "2017-01-01" }}, {"term": {"articleID": "XHDK-A-1293-#fJ3"}} ], "must_not": { "term": { "postDate": "2017-01-02" } } } } } } }
  2. 搜索ID为XHDK-A-1293-#fJ3,或者(ID为JODL-X-1937-#pV7并且日期为2017-01-01)的信息: GET /forum/_search { "query": { "constant_score": { "filter": { "bool": { "should": [ { "term": { "articleID": "XHDK-A-1293-#fJ3" } }, { "bool": { "must": [ { "term":{ "articleID": "JODL-X-1937-#pV7" } }, { "term": { "postDate": "2017-01-01" } } ] } } ] } } } } }

模拟标签功能

<code>POST /forum/_bulk
{ "update": { "_id": "1"} }
{ "doc" : {"tag" : ["java", "hadoop"]} }
{ "update": { "_id": "2"} }
{ "doc" : {"tag" : ["java"]} }
{ "update": { "_id": "3"} }
{ "doc" : {"tag" : ["hadoop"]} }
{ "update": { "_id": "4"} }
{ "doc" : {"tag" : ["java", "elasticsearch"]} }

# 验证
GET /forum/_search
{
"query": {
"constant_score": {
"filter": {

"terms": {
"articleID": [
"KDKE-B-9947-#kL5",
"QQPX-R-3956-#aD8"
]
}
}
}
}
}
</code>

检索只有java的tag标签

<code># 查询出tag中包含java的信息
GET /forum/_search
{
"query": {
"constant_score": {
"filter": {
"terms": {
"tag":["java"]
}
}
}
}
}

POST /forum/_bulk
{ "update": { "_id": "1"} }
{ "doc" : {"tag_cnt" : 2} }
{ "update": { "_id": "2"} }
{ "doc" : {"tag_cnt" : 1} }
{ "update": { "_id": "3"} }
{ "doc" : {"tag_cnt" : 1} }
{ "update": { "_id": "4"} }
{ "doc" : {"tag_cnt" : 2} }

GET /forum/_search
{
"query": {
"constant_score": {
"filter": {
"bool": {
"must": [
{
"term": {
"tag_cnt": 1
}

},
{
"terms": {
"tag": [
"java"
]
}
}
]
}
}
}
}
}


</code>

模拟查看次数

<code>POST /forum/_bulk
{"update":{"_id":"1"}}
{"doc":{"view_cnt":30}}
{"update":{"_id":"2"}}
{"doc":{"view_cnt":50}}
{"update":{"_id":"3"}}
{"doc":{"view_cnt":100}}
{"update":{"_id":"4"}}
{"doc":{"view_cnt":80}}


GET /forum/_search
{
"query": {
"constant_score": {
"filter": {
"range": {
"view_cnt": {
"gt": 30,
"lte": 60
}
}
}
}
}
}
</code>

模拟最近1个月的日期

<code>
POST /forum/_bulk
{ "index": { "_id": 5 }}
{ "articleID" : "DHJK-B-1395-#Ky5", "userID" : 3, "hidden": false, "postDate": "2017-03-01", "tag": ["elasticsearch"], "tag_cnt": 1, "view_cnt": 10 }


GET /forum/_search
{
"query": {
"constant_score": {
"filter": {
"range": {
"postDate": {
"gte": "2017-01-20||-30d"
}
}
}
}
}
}
</code>

模拟标题

<code>POST /forum/_bulk
{ "update": { "_id": "1"} }
{ "doc" : {"title" : "this is java and elasticsearch blog"} }
{ "update": { "_id": "2"} }
{ "doc" : {"title" : "this is java blog"} }
{ "update": { "_id": "3"} }
{ "doc" : {"title" : "this is elasticsearch blog"} }
{ "update": { "_id": "4"} }
{ "doc" : {"title" : "this is java, elasticsearch, hadoop blog"} }
{ "update": { "_id": "5"} }
{ "doc" : {"title" : "this is spark blog"} }

# 标题中包含 java 或 spark 的帖子

GET /forum/_search
{
"query": {
"match": {
"title": "java spark"
}
}
}

# 标题中包含 java 和 elasticsearch 的帖子

GET /forum/_search
{
"query": {
"match": {
"title" : {
"query": "java elasticsearch",
"operator": "and"
}
}
}
}

# 包含java,elasticsearch,spark,hadoop,4个关键字中至少3个的帖子
GET /forum/_search
{
"query": {
"match": {
"title": {
"query": "java elasticsearch spark hadoop",
"minimum_should_match": "75%"
}
}
}
}

</code>

相关度计算

must和should搜索对应的分数,加起来,除以must和should的总数

  • 排名第一:java,同时包含should中所有的关键字,hadoop,elasticsearch
  • 排名第二:java,同时包含should中的elasticsearch
  • 排名第三:java,不包含should中的任何关键字

should是可以影响相关度分数的

must是确保说,谁必须有这个关键字,同时会根据这个must的条件去计算出document对这个搜索条件的relevance score。在满足must的基础之上,should中的条件,不匹配也可以,但是如果匹配的更多,那么document的relevance score就会更高。

<code>
GET /forum/_search
{
"query": {
"bool": {
"must": {
"match": {
"title": "java"
}
},
"must_not": {
"match": {
"title": "spark"
}
},
"should": [
{
"match": {
"title": "hadoop"
}
},
{
"match": {
"title": "elasticsearch"
}
}
]
}
}
}

默认情况下,should是可以不匹配任何一个的,如果没有must的话,那么should中必须至少匹配一个才可以
</code>

term + bool实现的multiword的底层分析

<code>
# 普通match如何转换为term+should

{
"match": { "title": "java elasticsearch"}
}

使用诸如上面的match query进行多值搜索的时候,es会在底层自动将这个match query转换为bool的语法
bool should,指定多个搜索词,同时使用term query

{
"bool": {
"should": [
{ "term": { "title": "java" }},
{ "term": { "title": "elasticsearch" }}
]
}
}


# and match如何转换为term+must

{
"match": {
"title": {
"query": "java elasticsearch",
"operator": "and"
}
}
}

{
"bool": {
"must": [
{ "term": { "title": "java" }},
{ "term": { "title": "elasticsearch" }}
]
}
}


# minimum_should_match
{
"match": {
"title": {

"query": "java elasticsearch hadoop spark",
"minimum_should_match": "75%"
}
}
}

{
"bool": {
"should": [
{ "term": { "title": "java" }},
{ "term": { "title": "elasticsearch" }},
{ "term": { "title": "hadoop" }},
{ "term": { "title": "spark" }}
],
"minimum_should_match": 3
}
}

</code>

基于boost的细粒度条件权重

需求:搜索标题中包含java的帖子,同时呢,如果标题中包含hadoop或elasticsearch就优先搜索出来,同时呢,如果一个帖子包含java hadoop,一个帖子包含java elasticsearch,包含hadoop的帖子要比elasticsearch优先搜索出来。(注意:下面的示例中,must条件用的是blog,并且是通过boost把spark的权重提高到2来演示这种细粒度的权重控制;boost越大,该条件对相关度分数的影响越大。)

<code>GET /forum/_search
{
"query": {
"bool": {
"must": [
{
"match": {
"title": "blog"
}
}
],
"should": [
{
"match": {
"title": {
"query": "java"
}
}
},

{
"match": {
"title": {
"query": "hadoop"
}
}
},{
"match": {
"title": {
"query": "elasticsearch"
}
}
},
{
"match": {
"title": {
"query": "spark",
"boost": 2
}
}
}
]
}
}
}
</code>

多shard场景下relevance score不准确

  • 生产环境下,数据量大,尽可能实现均匀分配

数据量很大的话,其实一般情况下,在概率学的背景下,es都是在多个shard中均匀路由数据的,路由的时候根据_id,负载均衡。比如说有10个document,title都包含java,一共有5个shard,那么在概率学的背景下,如果负载均衡的话,其实每个shard都应该有2个doc,title包含java。如果说数据分布均匀的话,其实就没有刚才说的那个问题了。

  • 测试环境下,将索引的primary shard设置为1个,number_of_shards=1,index settings

如果说只有一个shard,那么当然,所有的document都在这个shard里面,就没有这个问题了

  • 测试环境下,搜索附带search_type=dfs_query_then_fetch参数,会将local IDF取出来计算global IDF

计算一个doc的相关度分数的时候,就会将所有shard的local IDF计算一下,获取出来,在本地进行global IDF分数的计算,会将所有shard的doc作为上下文来进行计算,也能确保准确性。但是production生产环境下,不推荐这个参数,因为性能很差。

dis_max实现best fields策略的多字段搜索

<code>
# 添加content内容

POST /forum/_bulk
{ "update": { "_id": "1"} }
{ "doc" : {"content" : "i like to write best elasticsearch article"} }
{ "update": { "_id": "2"} }
{ "doc" : {"content" : "i think java is the best programming language"} }
{ "update": { "_id": "3"} }
{ "doc" : {"content" : "i am only an elasticsearch beginner"} }
{ "update": { "_id": "4"} }
{ "doc" : {"content" : "elasticsearch and hadoop are all very good solution, i am a beginner"} }
{ "update": { "_id": "5"} }
{ "doc" : {"content" : "spark is best big data solution based on scala ,an programming language similar to java"} }


GET /forum/_search
{
"query": {

"bool":{
"should": [
{"match": {
"title": "java elasticsearch"
}},
{
"match": {
"content": "java solution"
}
}
]
}
}
}

# 结果分析

期望的是doc5,结果是doc2,doc4排在了前面

计算每个document的relevance score:所有匹配上的query的分数加起来,乘以matched query数量,除以总query数量

算一下doc4的分数

{ "match": { "title": "java solution" }},针对doc4,是有一个分数的
{ "match": { "content": "java solution" }},针对doc4,也是有一个分数的

所以是两个分数加起来,比如说,1.1 + 1.2 = 2.3
matched query数量 = 2
总query数量 = 2

2.3 * 2 / 2 = 2.3

算一下doc5的分数

{ "match": { "title": "java solution" }},针对doc5,是没有分数的
{ "match": { "content": "java solution" }},针对doc5,是有一个分数的

所以说,只有一个query是有分数的,比如2.3

matched query数量 = 1
总query数量 = 2

2.3 * 1 / 2 = 1.15

doc5的分数 = 1.15 < doc4的分数 = 2.3

best fields策略,就是说,搜索到的结果,应该是某一个field中匹配到了尽可能多的关键词,被排在前面;而不是尽可能多的field匹配到了少数的关键词,排在了前面



GET /forum/_search
{
"query": {
"dis_max": {
"queries": [
{ "match": { "title": "java solution" }},
{ "match": { "content": "java solution" }}
]
}
}
}

</code>

tie_breaker优化dis_max

可能在实际场景中出现的一个情况是这样的:

(1)某个帖子,doc1,title中包含java,content不包含java beginner任何一个关键词 (2)某个帖子,doc2,content中包含beginner,title中不包含任何一个关键词 (3)某个帖子,doc3,title中包含java,content中包含beginner (4)最终搜索,可能出来的结果是,doc1和doc2排在doc3的前面,而不是我们期望的doc3排在最前面

dis_max,只是取分数最高的那个query的分数而已,tie_breaker参数的意义,在于说,将其他query的分数,乘以tie_breaker,然后综合与最高分数的那个query的分数,综合在一起进行计算(tie_breaker的值,在0~1之间,是个小数)

<code>GET /forum/_search
{
"query": {
"dis_max": {

"queries": [
{
"match": {
"title": "java solution"
}
},
{
"match": {
"content": "java solution"
}
}

],
"tie_breaker": 0.1
}
}
}
</code>

multi_match & minimum_should_match

minimum_should_match 去长尾

<code>GET /forum/_search
{
"query": {
"multi_match": {
"query": "java solution",
"fields": ["title","content"],
"type": "best_fields",
"tie_breaker": 0.3,
"minimum_should_match": "50%"
}
}
}
</code>

most-fields策略

  • best-fields策略,主要是说将某一个field匹配尽可能多的关键词的doc优先返回回来
  • most-fields策略,主要是说尽可能返回更多field匹配到某个关键词的doc,优先返回回来: PUT /forum/_mapping { "properties": { "sub_title": { "type": "text", "analyzer": "english", "fields": { "std": { "type": "text", "analyzer": "standard" } } } } } POST /forum/_bulk { "update": { "_id": "1"} } { "doc" : {"sub_title" : "learning more courses"} } { "update": { "_id": "2"} } { "doc" : {"sub_title" : "learned a lot of course"} } { "update": { "_id": "3"} } { "doc" : {"sub_title" : "we have a lot of fun"} } { "update": { "_id": "4"} } { "doc" : {"sub_title" : "both of them are good"} } { "update": { "_id": "5"} } { "doc" : {"sub_title" : "haha, hello world"} }

cross-fields

cross-fields搜索,一个唯一标识,跨了多个field

跨多个field搜索一个标识,比如搜索一个人名,或者一个地址,就是cross-fields搜索

<code>
POST /forum/_bulk
{ "update": { "_id": "1"} }
{ "doc" : {"author_first_name" : "Peter", "author_last_name" : "Smith"} }
{ "update": { "_id": "2"} }
{ "doc" : {"author_first_name" : "Smith", "author_last_name" : "Williams"} }
{ "update": { "_id": "3"} }
{ "doc" : {"author_first_name" : "Jack", "author_last_name" : "Ma"} }
{ "update": { "_id": "4"} }
{ "doc" : {"author_first_name" : "Robbin", "author_last_name" : "Li"} }
{ "update": { "_id": "5"} }
{ "doc" : {"author_first_name" : "Tonny", "author_last_name" : "Peter Smith"} }


GET /forum/_search
{
"query": {
"multi_match": {
"query": "Peter Smith",
"fields": ["author_first_name", "author_last_name"],
"type": "cross_fields"
}
}
}

GET /forum/_search
{
"query": {
"multi_match": {
"query": "Peter Smith",
"type": "cross_fields",
"operator": "and",
"fields": ["author_first_name", "author_last_name"]
}
}
}

</code>

copy_to

将多个field组合成一个field

<code>
PUT /forum/_mapping
{
"properties": {
"new_author_full_name": {
"type": "text"
},
"new_author_first_name": {
"type": "text",
"copy_to": "new_author_full_name"
},
"new_author_last_name": {
"type": "text",
"copy_to": "new_author_full_name"
}

}
}

POST /forum/_bulk
{ "update": { "_id": "1"} }
{ "doc" : {"new_author_first_name" : "Peter", "new_author_last_name" : "Smith"} }
{ "update": { "_id": "2"} }
{ "doc" : {"new_author_first_name" : "Smith", "new_author_last_name" : "Williams"} }
{ "update": { "_id": "3"} }
{ "doc" : {"new_author_first_name" : "Jack", "new_author_last_name" : "Ma"} }
{ "update": { "_id": "4"} }
{ "doc" : {"new_author_first_name" : "Robbin", "new_author_last_name" : "Li"} }
{ "update": { "_id": "5"} }
{ "doc" : {"new_author_first_name" : "Tonny", "new_author_last_name" : "Peter Smith"} }

</code>

近似匹配

phrase match,proximity match:短语匹配,近似匹配

<code>
# match_phrase语法

GET /forum/_search
{
"query": {
"match_phrase": {
"content": "big data"
}
}
}
只有包含big data这个短语的doc才会返回,只包含big或者只包含data的doc不会返回


# 测试分词
GET _analyze
{
"text": "hello world, java spark",
"analyzer": "standard"
}


# 中间隔一个词(例如big data solution),通过slop仍然可以匹配
GET /forum/_search
{
"query": {
"match_phrase": {
"content": {
"query": "big solution",
"slop": 1
}
}
}
}

slop:查询词最多可以移动几次位置,来匹配到此短语

</code>

优先满足召回率

优先满足召回率,比如,java spark,包含java的也返回,包含spark的也返回,包含java和spark的也返回;同时兼顾精准度,就是包含java和spark,同时java和spark离的越近的doc排在最前面

<code>
GET /forum/_search
{
"query": {
"bool": {
"must": [
{
"match": {
"content": "java spark"
}
}
],
"should": [
{
"match_phrase": {
"content": {
"query": "java spark",
"slop":50
}
}
}
]
}
}
}

</code>

rescoring机制(重新打分)

默认情况下,match也许匹配了1000个doc,proximity match全都需要对每个doc进行一遍运算,判断能否slop移动匹配上,然后去贡献自己的分数

但是很多情况下,match出来也许1000个doc,其实用户大部分情况下是分页查询的,所以可能最多只会看前几页,比如一页是10条,最多也许就看5页,就是50条

proximity match只要对前50个doc进行slop移动去匹配,去贡献自己的分数即可,不需要对全部1000个doc都去进行计算和贡献分数

<code>
GET /forum/_search
{
"query": {
"match": {
"content": "java spark"
}
},
"rescore": {
"query": {
"rescore_query": {
"match_phrase": {
"content": {
"query": "java spark",
"slop": 50
}
}
}
},
"window_size": 50
}
}
</code>


分享到:


相關文章: