Java_Hive_UDF函数清洗数据_清洗出全国的省份数据

最近用Hadoop做数据清洗,需要根据原始的地区数据清洗出对应的省份。这里主要清洗的是大陆地区的数据(原始数据中不包含港澳台地区)。我用了最简单直接的方式:把地区字段中出现的城市名归类到所属省份。思路很简单,可以参考,只是比较费事,需要先整理出全国各省的城市列表:

import org.apache.hadoop.hive.ql.exec.UDF;

/**
 * Hive UDF that maps a free-form region string to the province-level division it
 * belongs to, by searching the string for known city names.
 *
 * <p>Matching is plain substring search against {@link #REGION_KEYWORDS}; the first
 * keyword found (in table order) decides the result. Hong Kong, Macau and Taiwan are
 * not covered. Only city-level keywords are matched, so a string that contains just a
 * province name (and no listed city) is not recognised.
 */
public class Hive_Location extends UDF{

	/**
	 * Lookup table scanned from top to bottom. In each row, element 0 is the value
	 * returned and elements 1..n are the substrings that select it.
	 *
	 * <p>Row order is significant because the first matching row wins; new rows should
	 * be appended at the end so that existing results do not change.
	 */
	private static final String[][] REGION_KEYWORDS = {
		// Municipalities: checked first, ahead of every province.
		{"北京", "北京"},
		{"上海", "上海"},
		{"天津", "天津"},
		{"重庆", "重庆"},
		// Hebei
		{"河北", "石家庄","唐山","秦皇岛","邯郸","邢台","保定","张家口","承德","沧州","廊坊","衡水"},
		// Shanxi
		{"山西", "太原","大同","朔州","忻州","阳泉","吕梁","晋中","长治","晋城","临汾","运城"},
		// Liaoning
		{"辽宁", "沈阳","大连","鞍山","抚顺","本溪","丹东","锦州","营口","阜新","辽阳","盘锦","铁岭","朝阳","葫芦岛"},
		// Jilin ("延边" also covers the full name "延边朝鲜族自治州")
		{"吉林", "长春","吉林","四平","通化","白城","辽源","松原","白山","延边"},
		// Heilongjiang
		{"黑龙江", "哈尔滨","齐齐哈尔","牡丹江","佳木斯","大庆","鸡西","双鸭山","伊春","七台河","鹤岗","绥化","黑河","大兴安岭"},
		// Jiangsu
		{"江苏", "南京","无锡","徐州","常州","苏州","南通","连云港","淮安","盐城","扬州","镇江","泰州","宿迁"},
		// Zhejiang
		{"浙江", "杭州","宁波","温州","嘉兴","湖州","绍兴","金华","衢州","舟山","台州","丽水"},
		// Anhui
		{"安徽", "合肥","芜湖","蚌埠","淮南","马鞍山","淮北","铜陵","安庆","黄山","阜阳","宿州","滁州","六安","宣城","池州","亳州"},
		// Fujian
		{"福建", "福州","厦门","莆田","三明","泉州","漳州","南平","龙岩","宁德"},
		// Jiangxi
		{"江西", "南昌","景德镇","萍乡","九江","抚州","鹰潭","赣州","吉安","宜春","新余","上饶"},
		// Shandong
		{"山东", "济南","青岛","淄博","枣庄","东营","烟台","潍坊","济宁","泰安","威海","日照","临沂","德州","聊城","滨州","菏泽"},
		// Henan
		{"河南", "郑州","开封","洛阳","平顶山","安阳","鹤壁","新乡","焦作","濮阳","许昌","漯河","三门峡","南阳","商丘","信阳","周口","驻马店"},
		// Hubei
		{"湖北", "武汉","黄石","十堰","宜昌","襄阳","鄂州","荆门","孝感","荆州","黄冈","咸宁","随州"},
		// Hunan
		{"湖南", "长沙","株洲","湘潭","衡阳","邵阳","岳阳","常德","张家界","益阳","郴州","永州","怀化","娄底"},
		// Guangdong
		{"广东", "广州","韶关","深圳","珠海","汕头","佛山","江门","湛江","茂名","肇庆","惠州","梅州","汕尾","河源","阳江","清远","东莞","中山","潮州","揭阳","云浮"},
		// Guangxi
		{"广西", "南宁","柳州","桂林","梧州","北海","防城港","钦州","贵港","玉林","百色","贺州","河池","来宾","崇左"},
		// Hainan
		{"海南", "海口","三亚","三沙","儋州"},
		// Sichuan
		{"四川", "成都","自贡","攀枝花","泸州","德阳","绵阳","广元","遂宁","内江","乐山","南充","眉山","宜宾","广安","达州","雅安","巴中","资阳"},
		// Guizhou
		{"贵州", "贵阳","六盘水","遵义","安顺","毕节","铜仁"},
		// Yunnan
		{"云南", "昆明","曲靖","玉溪","保山","昭通","丽江","普洱","临沧"},
		// Tibet
		{"西藏", "拉萨","日喀则","昌都","林芝","山南","那曲"},
		// Shaanxi
		{"陕西", "西安","铜川","宝鸡","咸阳","渭南","延安","汉中","榆林","安康","商洛"},
		// Gansu
		{"甘肃", "兰州","嘉峪关","金昌","白银","天水","武威","张掖","平凉","酒泉","庆阳","定西","陇南"},
		// Qinghai
		{"青海", "西宁","海东"},
		// Ningxia ("中卫" without the "市" suffix, like every other keyword)
		{"宁夏", "银川","石嘴山","吴忠","固原","中卫"},
		// Xinjiang
		{"新疆", "乌鲁木齐","克拉玛依","吐鲁番","哈密"},
		// Inner Mongolia: appended last so earlier rows keep their precedence.
		{"内蒙古", "呼和浩特","包头","乌海","赤峰","通辽","鄂尔多斯","呼伦贝尔","巴彦淖尔","乌兰察布"},
	};

	/**
	 * Entry point invoked by Hive for each row.
	 *
	 * @param hive_line the region field value read from Hive; may be {@code null}
	 * @return the short name of the matching province-level division (for example
	 *         {@code "河北"}), or {@code null} when the input is {@code null} or
	 *         contains none of the known keywords
	 */
	public String evaluate(String hive_line) {
		// A null column value must yield null instead of a NullPointerException.
		if (hive_line == null) {
			return null;
		}
		for (String[] row : REGION_KEYWORDS) {
			// Index 0 holds the result; keywords start at index 1.
			for (int i = 1; i < row.length; i++) {
				if (hive_line.contains(row[i])) {
					return row[0];
				}
			}
		}
		return null;
	}

	/** Manual smoke test: prints the province resolved for a sample city. */
	public static void main(String[] args) {
		Hive_Location l=new Hive_Location();
		System.out.println(l.evaluate("中卫市"));
	}
}

运行结果:

hive清洗mongo数据 hive做数据清洗_hadoop

当然还有一些其他的数据清洗,但其实都是大同小异的:写一个继承UDF的Java类并实现evaluate方法,然后导出jar包,在Hive中注册并调用就可以了。
注意:evaluate 方法不可以接受字符串数组(String[])这样的参数!一般使用字符串(String)类型的参数,而这个参数一般就是我们数据的字段值。
所需的必要jar包可以从这里获取

链接: https://pan.baidu.com/s/1mqZllMwiN3CR2ypOqp6K2A 提取码:k3lj

包含了:
hadoop-common-xxx.jar
hive-exec-xxx.jar
commons-math3-xxx.jar
(hadoop-common-xxx.jar包有两种版本,使用自己适合的就可以了。)