目录

本系列正在更新,点击下方查看


【Elasticsearch】使用IMDB学习ES(1)准备数据集
【Elasticsearch】使用IMDB学习ES(2)docker搭建环境
【Elasticsearch】使用IMDB准备ES学习用数据集(3) 设计映射结构

导入初始数据集

在上一篇文章中,我们设计了映射结构,这是为导入第一篇中准备好的数据集所做的准备。

在实际业务中,上线ES服务时,我们会导入之前已经存在的业务数据,来保证版本上线之后的业务数据一致。

导入脚本

这里使用python作为导入数据的脚本语言。

脚本内容如下

import datetime
import logging
import time
import csv
from elasticsearch import Elasticsearch
import logging

# Name the log after today's local date (YYYY-MM-DD) so each day's import run
# writes to its own ./imdb_<date>.log file; everything from DEBUG up is kept.
log_file = datetime.datetime.now().strftime('%Y-%m-%d')
logging.basicConfig(
    filename='./imdb_' + log_file + '.log',
    level=logging.DEBUG,
)

def read_from_tsv(file_path: str, column_names: list) -> list:
    """Read a tab-separated ``title.akas`` file and index each row via ``insertToEs``.

    A row whose ``titleId`` equals the literal header text ``titleId`` is
    skipped, and reading stops at the first row whose ``titleId`` is empty.

    Args:
        file_path: Path of the TSV file to read.
        column_names: Column names applied to every row, in file order. The
            rows are accessed by the keys ``titleId``, ``ordering``, ``title``,
            ``region``, ``language``, ``types``, ``attributes`` and
            ``isOriginalTitle``, so all of them must be present.

    Returns:
        The document ids (``titleId`` followed by ``ordering``) of the rows
        that were handed to ``insertToEs``, in file order. One short string is
        kept per row, so memory grows with the size of the file.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If an ``ordering`` value is not an integer.
    """
    indexed_ids = []
    # newline="" is what the csv module asks for; the file is assumed to be
    # UTF-8 (NOTE(review): confirm for dumps that do not come from IMDB).
    with open(file_path, "r", encoding="utf-8", newline="") as tsv_file:
        # QUOTE_NONE: the data is plain tab-separated text, so a literal '"'
        # inside a title must be kept as-is and must not start a quoted field
        # that could swallow the following columns or rows.
        reader = csv.DictReader(
            tsv_file,
            fieldnames=column_names,
            delimiter='\t',
            quoting=csv.QUOTE_NONE,
        )
        for row in reader:
            logging.debug("row: %s", row)
            title_id = row['titleId']
            if title_id == 'titleId':
                # Header line: column_names already supplies the field names.
                continue
            if title_id == '':
                break

            doc_id = title_id + str(row['ordering'])
            ordering = int(row['ordering'])
            title = row['title']
            region = row['region'].replace("\n", "")
            language = row['language'].replace("\n", "")
            types = row['types'].replace("\n", "")
            # Read the row's own attributes column (it used to copy 'types').
            attributes = row['attributes'].replace("\n", "")
            # bool() of any non-empty string is True, so "0" was stored as
            # true; only the flag value "1" marks an original title.
            is_original_title = row['isOriginalTitle'].strip() == '1'

            insertToEs(
                doc_id, title_id, ordering, title, region, language,
                types, attributes, is_original_title,
            )
            indexed_ids.append(doc_id)

    return indexed_ids

def insertToEs(id, titleId, ordering, title, region, language, types, attributes, isOriginalTitle):
    """Index one title record into the ``imdb`` index, best-effort.

    The document is sent through the module-level client ``es`` with ``id`` as
    its document id. ``types`` and ``attributes`` are split on single spaces
    and stored as lists; every other argument is stored unchanged.

    Any failure (including a missing ``es`` client) is logged with its
    traceback and the document id, then swallowed so that one bad record does
    not abort a long-running import.

    Args:
        id: Document id; also stored in the ``id`` field.
        titleId: Value of the ``titleId`` field.
        ordering: Value of the ``ordering`` field.
        title: Value of the ``title`` field.
        region: Value of the ``region`` field.
        language: Value of the ``language`` field.
        types: Space-separated string stored as the ``types`` list.
        attributes: Space-separated string stored as the ``attributes`` list.
        isOriginalTitle: Value of the ``isOriginalTitle`` field.

    Returns:
        None.
    """
    try:
        document = {
            'id': id,
            'titleId': titleId,
            'ordering': ordering,
            'title': title,
            'region': region,
            'language': language,
            'types': types.split(' '),
            'attributes': attributes.split(' '),
            'isOriginalTitle': isOriginalTitle,
        }
        es.index(
            index="imdb",
            id=id,
            doc_type="_doc",
            body=document,
        )
        logging.info("cursor:%s", id)
    except Exception:
        # Deliberately broad: the import keeps going, but the failure is
        # recorded in the log (with traceback and document id) rather than
        # only printed to stdout.
        logging.exception("failed to index document %s", id)

if __name__ == "__main__":
    # Wall-clock start, used only for the elapsed-time report printed below.
    begin_time = time.time()

    # insertToEs looks the client up under the module-level name `es`.
    es = Elasticsearch(
        [{'host': '127.0.0.1', 'port': 9200}],
        timeout=6000,
    )

    # Column names applied to each row of the TSV file, in file order.
    akas_columns = [
        'titleId',
        'ordering',
        'title',
        'region',
        'language',
        'types',
        'attributes',
        'isOriginalTitle',
    ]
    read_from_tsv('title.akas.tsv', akas_columns)

    print("done with: ", time.time() - begin_time)

检查数据

查看索引

我们可以在索引管理中直接查看

【Elasticsearch】使用IMDB学习ES(4)导入数据_elasticsearch

查询

我们可以直接在开发工具中进行操作,也可以实验性质地拼接查询条件,然后再在自己的系统中完成查询方法的封装。

GET imdb

{
"imdb" : {
"aliases" : { },
"mappings" : {
"properties" : {
"attributes" : {
"type" : "keyword"
},
"id" : {
"type" : "text"
},
"isOriginalTitle" : {
"type" : "boolean"
},
"language" : {
"type" : "keyword"
},
"ordering" : {
"type" : "integer"
},
"region" : {
"type" : "keyword"
},
"title" : {
"type" : "text"
},
"titleId" : {
"type" : "text"
},
"types" : {
"type" : "keyword"
}
}
},
"settings" : {
"index" : {
"routing" : {
"allocation" : {
"include" : {
"_tier_preference" : "data_content"
}
}
},
"number_of_shards" : "1",
"provided_name" : "imdb",
"creation_date" : "1618961115453",
"number_of_replicas" : "1",
"uuid" : "nCG_YzsHQV-YmvJfOxOZZg",
"version" : {
"created" : "7120099"
}
}
}
}
}
GET imdb/_search
{
"query":{
"match":{
"title":"clown"
}
}
}


{
"took" : 747,
"timed_out" : false,
"_shards" : {
"total" : 1,
"successful" : 1,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : {
"value" : 41,
"relation" : "eq"
},
"max_score" : 9.891902,
"hits" : [
{
"_index" : "imdb",
"_type" : "_doc",
"_id" : "tt00065201",
"_score" : 9.891902,
"_source" : {
"id" : "tt00065201",
"titleId" : "tt0006520",
"ordering" : 1,
"title" : "The Clown",
"region" : "US",
"language" : """\N""",
"types" : [
"""\N"""
],
"attributes" : [
"""\N"""
],
"isOriginalTitle" : true
}
},
{
"_index" : "imdb",
"_type" : "_doc",
"_id" : "tt00136041",
"_score" : 9.891902,
"_source" : {
"id" : "tt00136041",
"titleId" : "tt0013604",
"ordering" : 1,
"title" : "Le clown",
"region" : "FR",
"language" : """\N""",
"types" : [
"imdbDisplay"
],
"attributes" : [
"imdbDisplay"
],
"isOriginalTitle" : true
}
},
{
"_index" : "imdb",
"_type" : "_doc",
"_id" : "tt00162122",
"_score" : 9.891902,
"_source" : {
"id" : "tt00162122",
"titleId" : "tt0016212",
"ordering" : 2,
"title" : "O clown",
"region" : "GR",
"language" : """\N""",
"types" : [
"""\N"""
],
"attributes" : [
"""\N"""
],
"isOriginalTitle" : true
}
},
{
"_index" : "imdb",
"_type" : "_doc",
"_id" : "tt00177613",
"_score" : 9.891902,
"_source" : {
"id" : "tt00177613",
"titleId" : "tt0017761",
"ordering" : 3,
"title" : "The Clown",
"region" : "US",
"language" : """\N""",
"types" : [
"""\N"""
],
"attributes" : [
"""\N"""
],
"isOriginalTitle" : true
}
},
{
"_index" : "imdb",
"_type" : "_doc",
"_id" : "tt00177614",
"_score" : 9.891902,
"_source" : {
"id" : "tt00177614",
"titleId" : "tt0017761",
"ordering" : 4,
"title" : "The Clown",
"region" : """\N""",
"language" : """\N""",
"types" : [
"original"
],
"attributes" : [
"original"
],
"isOriginalTitle" : true
}
},
{
"_index" : "imdb",
"_type" : "_doc",
"_id" : "tt00311621",
"_score" : 9.891902,
"_source" : {
"id" : "tt00311621",
"titleId" : "tt0031162",
"ordering" : 1,
"title" : "Clown Princes",
"region" : "US",
"language" : """\N""",
"types" : [
"""\N"""
],
"attributes" : [
"""\N"""
],
"isOriginalTitle" : true
}
},
{
"_index" : "imdb",
"_type" : "_doc",
"_id" : "tt00356199",
"_score" : 9.891902,
"_source" : {
"id" : "tt00356199",
"titleId" : "tt0035619",
"ordering" : 9,
"title" : "Akrovatis clown",
"region" : "GR",
"language" : """\N""",
"types" : [
"""\N"""
],
"attributes" : [
"""\N"""
],
"isOriginalTitle" : true
}
},
{
"_index" : "imdb",
"_type" : "_doc",
"_id" : "tt00000191",
"_score" : 8.570279,
"_source" : {
"id" : "tt00000191",
"titleId" : "tt0000019",
"ordering" : 1,
"title" : "The Clown Barber",
"region" : "GB",
"language" : """\N""",
"types" : [
"""\N"""
],
"attributes" : [
"""\N"""
],
"isOriginalTitle" : true
}
},
{
"_index" : "imdb",
"_type" : "_doc",
"_id" : "tt00071241",
"_score" : 8.570279,
"_source" : {
"id" : "tt00071241",
"titleId" : "tt0007124",
"ordering" : 1,
"title" : "The New Clown",
"region" : "GB",
"language" : """\N""",
"types" : [
"""\N"""
],
"attributes" : [
"""\N"""
],
"isOriginalTitle" : true
}
},
{
"_index" : "imdb",
"_type" : "_doc",
"_id" : "tt00123942",
"_score" : 8.570279,
"_source" : {
"id" : "tt00123942",
"titleId" : "tt0012394",
"ordering" : 2,
"title" : "The Little Clown",
"region" : "US",
"language" : """\N""",
"types" : [
"""\N"""
],
"attributes" : [
"""\N"""
],
"isOriginalTitle" : true
}
}
]
}
}

总结

到此为止,基础的ES业务实践系列完结。

在接下来的一段实践中,我们将开始学习复杂ES业务中的技术细节。