BlueXIII's Blog

热爱技术,持续学习

0%

ES至ES数据迁移

参考文档

主机

索引梳理

名称 UUID Lucene 文档 储存空间
ci_thesis OJeVZnhkSJiMTNw0TmViuA 24503853 84 GB
cnki-data S8JyLIldR4C3ikH2jq_qUQ 12432339 48.8 GB
elantender vKC7Iv8lQQeO1i3iYCT-PA 307 6.66 MB
elantender2 hfut5RfWSo-2VT8S72ytxw 2564581 58.5 GB
elantender3 XQ-qAY_NSjuYK2b17FUkmg 800 16 MB
ieee-data K3juwXaKQVW-0J72NiJ5Pw 2774971 32.1 GB
lunwen-cn 7bQJ8sE4Tvi5L8aBQvfzeg 12340949 47.3 GB
policies AxRTd-uVQ_q6C4iw9l9soA 472817 13.9 GB
policies_hydrabot EO4sAmmVQ2m_-PyczXOczQ 2099896 92.8 GB
rc_patents2 DBSI6NwEQyaf92Kscl0ciw 25016403 132 GB
springer-data SjopLNkNRXmEQWfGw5B2LA 2416686 29.4 GB

优化

vi /etc/logstash/logstash.yml

1
2
# Larger batches so the Elasticsearch output receives fewer, bigger requests.
pipeline.batch.size: 1000
# Per the Logstash settings reference: wait time in milliseconds for more events
# before an undersized batch is dispatched.
pipeline.batch.delay: 500

vi /etc/logstash/jvm.options

1
2
# JVM heap: initial size (-Xms) and maximum size (-Xmx) are both fixed at 8 GB.
-Xms8g
-Xmx8g

模板1

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
# Template 1: copy one createTime window of the policies_hydrabot index
# from the source cluster (10.20.41.30) to the target cluster (10.194.98.4).
input {
  elasticsearch {
    hosts => "10.20.41.30:9200"
    index => "policies_hydrabot"
    # Only documents inside this createTime window are read; adjust the range per run.
    query => '{"query": {"range": {"createTime": {"gte": "2021-1-1 00:00:00","lt": "2021-10-1 00:00:00"}}}}'
    # Expose the source document's _index/_type/_id under @metadata for the output below.
    docinfo => true
    size => 1000
    # Longer scroll keep-alive plus explicit request/socket timeouts (same values as the
    # rc_patents2 template) so a slow output or a brief network drop does not reset the
    # scroll and cause documents to be inserted again.
    scroll => "5m"
    request_timeout_seconds => 600
    socket_timeout_seconds => 600
  }
}

filter {
  mutate {
    # Remove the fields Logstash adds to every event so they are not written to the target.
    remove_field => ["@timestamp", "@version"]
  }
}

output {
  elasticsearch {
    hosts => ["http://10.194.98.4:9200"]
    # Reuse the source index name, type and id so each document lands in the same place
    # and a re-run overwrites instead of duplicating.
    index => "%{[@metadata][_index]}"
    document_type => "%{[@metadata][_type]}"
    document_id => "%{[@metadata][_id]}"
  }
}

模板2

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
# Template 2: copy the whole rc_patents2 index from the source cluster
# (10.20.41.30) to the target cluster (10.194.98.4).
input {
  elasticsearch {
    hosts => ["10.20.41.30:9200"]
    index => "rc_patents2"
    # Expose the source document's _index/_type/_id under @metadata for the output below.
    docinfo => true
    size => 1000
    scroll => "5m"
    # Generous timeouts so a slow response does not abort the scroll mid-migration.
    request_timeout_seconds => 600
    socket_timeout_seconds => 600
  }
}

filter {
  mutate {
    # Remove the fields Logstash adds to every event so they are not written to the target
    # (same filter as the policies_hydrabot template).
    # NOTE(review): if the source documents carry their own @timestamp field, drop it from this list.
    remove_field => ["@timestamp", "@version"]
  }
}

output {
  elasticsearch {
    hosts => ["http://10.194.98.4:9200"]
    # Reuse the source index name, type and id so each document lands in the same place
    # and a re-run overwrites instead of duplicating.
    index => "%{[@metadata][_index]}"
    document_type => "%{[@metadata][_type]}"
    document_id => "%{[@metadata][_id]}"
  }
}

模板3

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
# Template 3: read the rc_patents2 index from 10.194.98.4 and write the
# selected fields to a CSV file.
input {
  elasticsearch {
    index => "rc_patents2"
    hosts => ["http://10.194.98.4:9200"]
    docinfo => true
    scroll => "5m"
    size => 1000
    socket_timeout_seconds => 600
    request_timeout_seconds => 600
  }
}

output {
  csv {
    path => "/data/rc_patents2.csv"
    # Column order of the CSV follows the order of this list.
    fields => [
      "applyDate",
      "applyID",
      "applyMan",
      "host",
      "id",
      "industry_category1_zh",
      "industry_category2_tag",
      "industry_category2_zh",
      "industry_jinan_names",
      "industry_jinan_tags",
      "inventor",
      "mainID",
      "openDate",
      "openID",
      "ownerAdd",
      "ownerCode",
      "path",
      "proxyCompany",
      "proxyMan",
      "status",
      "summary",
      "ten_power_names",
      "ten_power_tags",
      "ten_power_unionNames",
      "title",
      "type",
      "urlMd5"
    ]
  }
}

执行

1
# Start Logstash 7.15.2 with its settings directory and the pipeline definition in logstash.conf.
/data/logstash-7.15.2/bin/logstash "--path.settings" "/data/logstash-7.15.2/config" -f logstash.conf

分时间查询迁移

policies_hydrabot

分时段查询:

1
2
3
4
5
6
7
8
{"query": {"bool": {"must_not": {"exists": {"field": "createTime"}}}}}  41724 41717
{"query": {"range": {"createTime": {"lt": "2000-1-1 00:00:00"}}}} 1679 1679
{"query": {"range": {"createTime": {"gte": "2000-1-1 00:00:00","lt": "2020-1-1 00:00:00"}}}} 520994 520909 534371 525134
{"query": {"range": {"createTime": {"gte": "2020-1-1 00:00:00","lt": "2021-1-1 00:00:00"}}}} 578494 578452 585849 580347
{"query": {"range": {"createTime": {"gte": "2021-1-1 00:00:00","lt": "2021-10-1 00:00:00"}}}} 711246 711180 726068 715264
{"query": {"range": {"createTime": {"gte": "2021-10-1 00:00:00","lt": "2021-12-1 00:00:00"}}}} 269026 262501
{"query": {"range": {"createTime": {"gte": "2021-12-1 00:00:00"}}}} 67924 67913
{"query": {"range": {"ggsj": {"gt": "1638720000000"}}}}

踩坑

  • jvm.options参数中,将堆内存(-Xms/-Xmx)调到物理内存的50%
  • logstash.yml参数中,将batch.size调大,降低请求频次,避免output es被写死
  • 使用7.X的logstash,可以向下兼容6.X的ES
  • 7.X的logstash,可以配置input的timeout参数,避免网络闪断造成的scroll重置,重复插入数据问题
  • 7.X的logstash,可以明显降低output es的lucene缓存占用