Hive将数据导入到ElasticSearch
0.环境准备
ElasticSearch
Hive
1.安装插件
1.下载对应版本
https://www.elastic.co/cn/downloads/hadoop
hive支持json
http://www.congiu.net/hive-json-serde/1.3.8/hdp23/json-serde-1.3.8-jar-with-dependencies.jar
http://www.congiu.net/hive-json-serde/1.3.8/hdp23/json-udf-1.3.8-jar-with-dependencies.jar
wget -r https://artifacts.elastic.co/downloads/elasticsearch-hadoop/elasticsearch-hadoop-7.17.1.zip
2.安装unzip工具
yum install -y unzip
3.解压文件
unzip elasticsearch-hadoop-7.17.1.zip
4.找到jar文件
cd elasticsearch-hadoop-7.17.1/dist/
ll
#只能添加这个,hadoop.jar不用添加
elasticsearch-hadoop-hive-7.17.1.jar
#/root/elasticsearch/elasticsearch-hadoop-7.17.1/dist/elasticsearch-hadoop-hive-7.17.1.jar
5.进入hive添加jar(add jar 仅对当前窗口有效)
hive
#add jar 包 仅对当前窗口有效,下次使用需重新添加
hive> add jar /root/elasticsearch/elasticsearch-hadoop-7.17.1/dist/elasticsearch-hadoop-hive-7.17.1.jar;
#Added [/root/elasticsearch/elasticsearch-hadoop-7.17.1/dist/elasticsearch-hadoop-hive-7.17.1.jar] to class path
#Added resources: [/root/elasticsearch/elasticsearch-hadoop-7.17.1/dist/elasticsearch-hadoop-hive-7.17.1.jar]
2.hive表映射
1.创建Hive辅助表
hive> create database 5ewb;
hive> use 5ewb;
hive> create table `inwb` ( `phone` bigint, `uid` bigint ) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' LINES TERMINATED BY '\n';
OK
Time taken: 0.69 seconds
2.导入数据
hive> use 5ewb;
OK
hive> load data local inpath '/root/shegongku/wb5e.txt' into table `inwb`;
Loading data to table 5ewb.user
OK
Time taken: 97.234 seconds
hive> select * from `inwb` limit 10;
OK
NULL NULL
15890981333 5350176154
15944850489 6057766172
17073799004 6547208199
18392710332 3754369810
18047430444 6444293239
13762520188 3866009977
18408812716 6134347857
18477461107 6031338428
13647595899 6796854079
#删除数据中的空行
hive> insert overwrite table `inwb` select * from `inwb` where phone is not null;
3.创建hive映射表
hive
hive> use 5ewb;
OK
hive> CREATE TABLE `outwb` ( `phone` bigint, `uid` bigint ) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' LINES TERMINATED BY '\n'
STORED BY 'org.elasticsearch.hadoop.hive.EsStorageHandler'
TBLPROPERTIES('es.resource' = 'wb/_doc',
'es.index.auto.create' = 'true',
'es.nodes' = 'http://10.107.116.11',
'es.port'='9200',
'es.http.timeout'='120m',
'es.nodes.wan.only'='true');
OK
Time taken: 0.252 seconds
3.导入数据
hive> use 5ewb;
OK
Time taken: 0.176 seconds
hive> insert overwrite table `outwb` select * from `inwb`;
评论 (0)