Java_Hive_UDF函数清洗数据_清洗出全国的省份数据
最近用 Hadoop 做数据清洗,需要根据原始的地区数据清洗出对应的省份数据。这里主要清洗的是大陆地区的数据,原始数据中不包含港澳台地区的数据。我用了最简单直接的方式来解决:思路就是把城市名按省份归类,再逐个匹配,可以参考一下。缺点是比较费事,需要先整理出全国的地区数据:
import org.apache.hadoop.hive.ql.exec.UDF;
/**
 * Hive UDF that maps a free-form location string to the name of the
 * province-level region it belongs to.
 *
 * <p>Matching is a plain substring search against a fixed keyword table:
 * the four municipalities are matched by their own name, every other region
 * by the names of its prefecture-level cities. Rows are scanned in table
 * order and the first keyword found decides the result. Hong Kong, Macau and
 * Taiwan are not covered.
 */
public class Hive_Location extends UDF {

    /**
     * Ordered lookup table. In each row, element 0 is the region name that is
     * returned and the remaining elements are the keywords that identify it.
     * Row order is significant because the first match wins, so new rows
     * should be appended at the end to keep existing results stable.
     */
    private static final String[][] REGION_KEYWORDS = {
        // Municipalities: Beijing, Shanghai, Tianjin, Chongqing
        {"北京", "北京"},
        {"上海", "上海"},
        {"天津", "天津"},
        {"重庆", "重庆"},
        // Hebei
        {"河北", "石家庄", "唐山", "秦皇岛", "邯郸", "邢台", "保定", "张家口", "承德", "沧州", "廊坊", "衡水"},
        // Shanxi
        {"山西", "太原", "大同", "朔州", "忻州", "阳泉", "吕梁", "晋中", "长治", "晋城", "临汾", "运城"},
        // Liaoning
        {"辽宁", "沈阳", "大连", "鞍山", "抚顺", "本溪", "丹东", "锦州", "营口", "阜新", "辽阳", "盘锦", "铁岭", "朝阳", "葫芦岛"},
        // Jilin
        {"吉林", "长春", "吉林", "四平", "通化", "白城", "辽源", "松原", "白山", "延边朝鲜"},
        // Heilongjiang
        {"黑龙江", "哈尔滨", "齐齐哈尔", "牡丹江", "佳木斯", "大庆", "鸡西", "双鸭山", "伊春", "七台河", "鹤岗", "绥化", "黑河", "大兴安岭"},
        // Jiangsu
        {"江苏", "南京", "无锡", "徐州", "常州", "苏州", "南通", "连云港", "淮安", "盐城", "扬州", "镇江", "泰州", "宿迁"},
        // Zhejiang
        {"浙江", "杭州", "宁波", "温州", "嘉兴", "湖州", "绍兴", "金华", "衢州", "舟山", "台州", "丽水"},
        // Anhui
        {"安徽", "合肥", "芜湖", "蚌埠", "淮南", "马鞍山", "淮北", "铜陵", "安庆", "黄山", "阜阳", "宿州", "滁州", "六安", "宣城", "池州", "亳州"},
        // Fujian
        {"福建", "福州", "厦门", "莆田", "三明", "泉州", "漳州", "南平", "龙岩", "宁德"},
        // Jiangxi
        {"江西", "南昌", "景德镇", "萍乡", "九江", "抚州", "鹰潭", "赣州", "吉安", "宜春", "新余", "上饶"},
        // Shandong
        {"山东", "济南", "青岛", "淄博", "枣庄", "东营", "烟台", "潍坊", "济宁", "泰安", "威海", "日照", "临沂", "德州", "聊城", "滨州", "菏泽"},
        // Henan
        {"河南", "郑州", "开封", "洛阳", "平顶山", "安阳", "鹤壁", "新乡", "焦作", "濮阳", "许昌", "漯河", "三门峡", "南阳", "商丘", "信阳", "周口", "驻马店"},
        // Hubei
        {"湖北", "武汉", "黄石", "十堰", "宜昌", "襄阳", "鄂州", "荆门", "孝感", "荆州", "黄冈", "咸宁", "随州"},
        // Hunan
        {"湖南", "长沙", "株洲", "湘潭", "衡阳", "邵阳", "岳阳", "常德", "张家界", "益阳", "郴州", "永州", "怀化", "娄底"},
        // Guangdong
        {"广东", "广州", "韶关", "深圳", "珠海", "汕头", "佛山", "江门", "湛江", "茂名", "肇庆", "惠州", "梅州", "汕尾", "河源", "阳江", "清远", "东莞", "中山", "潮州", "揭阳", "云浮"},
        // Guangxi
        {"广西", "南宁", "柳州", "桂林", "梧州", "北海", "防城港", "钦州", "贵港", "玉林", "百色", "贺州", "河池", "来宾", "崇左"},
        // Hainan
        {"海南", "海口", "三亚", "三沙", "儋州"},
        // Sichuan
        {"四川", "成都", "自贡", "攀枝花", "泸州", "德阳", "绵阳", "广元", "遂宁", "内江", "乐山", "南充", "眉山", "宜宾", "广安", "达州", "雅安", "巴中", "资阳"},
        // Guizhou
        {"贵州", "贵阳", "六盘水", "遵义", "安顺", "毕节", "铜仁"},
        // Yunnan
        {"云南", "昆明", "曲靖", "玉溪", "保山", "昭通", "丽江", "普洱", "临沧"},
        // Tibet
        {"西藏", "拉萨", "日喀则", "昌都", "林芝", "山南", "那曲"},
        // Shaanxi
        {"陕西", "西安", "铜川", "宝鸡", "咸阳", "渭南", "延安", "汉中", "榆林", "安康", "商洛"},
        // Gansu
        {"甘肃", "兰州", "嘉峪关", "金昌", "白银", "天水", "武威", "张掖", "平凉", "酒泉", "庆阳", "定西", "陇南"},
        // Qinghai
        {"青海", "西宁", "海东"},
        // Ningxia ("中卫" without the "市" suffix, like every other keyword,
        // so that both "中卫" and "中卫市" are recognised)
        {"宁夏", "银川", "石嘴山", "吴忠", "固原", "中卫"},
        // Xinjiang
        {"新疆", "乌鲁木齐", "克拉玛依", "吐鲁番", "哈密"},
        // Inner Mongolia (appended last so that inputs which already matched
        // an earlier row keep their previous result)
        {"内蒙古", "呼和浩特", "包头", "乌海", "赤峰", "通辽", "鄂尔多斯", "呼伦贝尔", "巴彦淖尔", "乌兰察布"},
    };

    /**
     * Entry point of the UDF; a Hive UDF must expose a method named
     * {@code evaluate}.
     *
     * @param hive_line the field value read from Hive; may be {@code null}
     * @return the name of the first region in the table one of whose keywords
     *         occurs in {@code hive_line}, or {@code null} when the input is
     *         {@code null} or no keyword matches
     */
    public String evaluate(String hive_line) {
        // A NULL column value yields a NULL result instead of failing the call.
        if (hive_line == null) {
            return null;
        }
        for (String[] row : REGION_KEYWORDS) {
            // Index 0 holds the region name to return; keywords start at index 1.
            for (int i = 1; i < row.length; i++) {
                if (hive_line.contains(row[i])) {
                    return row[0];
                }
            }
        }
        return null;
    }

    /** Quick manual check of {@link #evaluate(String)}. */
    public static void main(String[] args) {
        Hive_Location l = new Hive_Location();
        System.out.println(l.evaluate("中卫市"));
    }
}
运行结果:宁夏
当然还有一些其他的数据清洗,但其实都是大同小异的:主要就是使用 UDF 函数,写好继承 UDF 的 Java 类并实现其中的 evaluate 方法,然后导出 jar 包,在 Hive 中调用就可以了。注意:
evaluate 方法不可以接受字符串数组(String[])这样的参数!一般使用字符串(String)类型的参数,而这个参数一般就是我们数据的字段值。
所需的必要jar包可以从这里获取
链接: https://pan.baidu.com/s/1mqZllMwiN3CR2ypOqp6K2A 提取码: k3lj
包含了:
hadoop-common-xxx.jar
hive-exec-xxx.jar
commons-math3-xxx.jar
(hadoop-common-xxx.jar包有两种版本,使用自己适合的就可以了。)