I've recently been working on a region-statistics feature. The app-download logs record each user's IP, and my boss wants regional statistics based on those IPs to see which apps users in which areas like to download. The first version ran the QQwry (纯真) IP database with plain Java and then aggregated the logs, but it was too slow: 20+ MB of data took several hours. The task was then handed to me to try with Hadoop.

1. IP resolution, turning an IP address into a real-world location: uses the QQwry (纯真) IP database file QQwry.dat

    Reference code:
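A minimal usage sketch (this is not the original reference code; it only assumes the IPSeeker class from the QQwry helper classes is on the classpath, with the constructor taking the path to qqwry.dat and the getCountry(ip) method, exactly as it is used in the mapper below):

public class IpLookupDemo {
	public static void main(String[] args) throws Exception {
		// load the QQwry database file from the local working directory
		IPSeeker ipSeeker = new IPSeeker("qqwry.dat");
		// getCountry returns the raw location string recorded in QQwry,
		// e.g. a province/city string for Chinese IPs (example IP for illustration)
		String raw = ipSeeker.getCountry("202.106.0.20");
		System.out.println(raw);
	}
}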

2. Create a new Hadoop project and put the three IP-resolution classes into one package.


3. Testing QQwry: the project needs each IP's country and province, appended to the original log line as two new fields. Testing showed that the strings QQwry returns can't be used as-is, so they need further formatting:


  @1. For the five autonomous regions (Inner Mongolia, Guangxi, Xinjiang, Ningxia, Tibet), return "中国" plus the region name.

  @2. Strings containing "省" (province): return "中国" plus the province.

  @3. Strings containing "市" (city), mainly the four municipalities Shanghai, Beijing, Chongqing, and Tianjin: return "中国" plus the city.

  @4. Strings containing "中国": return just "中国", with the province field left empty.

  @5. Anything else: return the parsed string as-is, with the province left empty (these are basically foreign locations).

  @6. Testing showed the rules above still leave plenty of dirty data, e.g. entries containing universities and colleges, "欧洲中部" (Central Europe), internet cafés, and so on. These are handled by saving the dirty entries in a fixed format in dirtydata.txt, loading that file into a map at startup, and using the map for formatting (see the example layout after the parsing function in step 5).

4. Write the formatting function according to the rules in step 3.

private String formatCity(String country) {
	// special-case the five autonomous regions
	for (String spe : spelist) {
		if (country.indexOf(spe) != -1)
			return "中国," + spe;
	}
	if (country.indexOf("省") != -1) {
		String contrysplit[] = country.split("省");
		return "中国," + contrysplit[0] + "省";
	} else if (country.indexOf("市") != -1) {
		String citysplist[] = country.split("市");
		return "中国," + citysplist[0] + "市";
	} else if (umap.containsKey(country)) {
		// dirty entries (universities, internet cafés, ...) mapped via dirtydata.txt
		return "中国," + umap.get(country);
	} else if (country.indexOf("中国") != -1) {
		return "中国," + "";
	} else {
		return country + "," + "";
	}
}
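For illustration, with a couple of hypothetical QQwry strings: formatCity("河北省石家庄市") would return "中国,河北省" (the "省" branch is checked before the "市" branch), and formatCity("北京市海淀区") would return "中国,北京市".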
5. Parse the dirty data: read the txt file and write a parsing function.
public Map<String, String> getUniversMap(String filepath)
		throws FileNotFoundException {
	// each line of dirtydata.txt carries three comma-separated fields;
	// field 0 is the raw QQwry string, field 2 is the cleaned province
	Map<String, String> universMap = new HashMap<String, String>();
	FileReader fr = new FileReader(filepath);
	BufferedReader br = new BufferedReader(fr);
	String readoneline;
	String tmp[];
	try {
		while ((readoneline = br.readLine()) != null) {
			tmp = readoneline.split(",");
			if (tmp.length == 3) {
				universMap.put(tmp[0], tmp[2]);
			}
		}
	} catch (IOException e) {
		e.printStackTrace();
	} finally {
		try {
			br.close();
		} catch (IOException e) {
			e.printStackTrace();
		}
	}
	return universMap;
}
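The exact contents of dirtydata.txt are not reproduced here; from the parser above, each line is expected to have three comma-separated fields, keyed on the raw QQwry string, with the cleaned province in the third field (the middle field is ignored). A hypothetical line, purely for illustration:

	清华大学,note,北京市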

6. Write the map/reduce program, call the resolution code from step 1, and format the result.

public class ConvertIp {

	public static class ItemMapper extends
			Mapper<Object, Text, Text, NullWritable> {
		private Text outkey = new Text("");
		private IPSeeker ipSeeker;
		private String filepath;
		Map<String, String> umap;
		final String spelist[] = { "内蒙古", "广西", "新疆", "宁夏", "西藏" };
		protected void map(Object key, Text value, Context context)
				throws IOException, InterruptedException {

			String line = value.toString();
			String details[] = line.split(",");
			if(details.length != 15){
				return;
			}
			String ip = details[3];
			String reg = "(\\d{1,3})\\.(\\d{1,3})\\.(\\d{1,3})\\.(\\d{1,3})";
			// data cleaning: 1. the line must have 15 fields  2. dirty location strings are normalized in formatCity
			if (ip.matches(reg)) {
				outkey.set(new StringBuffer().append(line).append(
						",").append(formatCity(ipSeeker.getCountry(ip)))
						.toString());

				context.write(outkey, NullWritable.get());
			}
		}

		@Override
		protected void setup(Context context) throws IOException,
				InterruptedException {
			ipSeeker = new IPSeeker("qqwry.dat");   // initialize the IP resolver; with this relative path,
			// qqwry.dat and dirtydata.txt must be shipped with the job via
			// "-files <path to qqwry.dat>,<path to dirtydata.txt>" on the hadoop
			// command line (they are copied to the job's temporary working directory)
			filepath = "dirtydata.txt";             // initialize the dirty-data file path
			try {
				umap = getUniversMap(filepath);
			} catch (FileNotFoundException e) {
				e.printStackTrace();
			}
			super.setup(context);
		}

		private String formatCity(String country) {
			for (String spe : spelist) {
				if (country.indexOf(spe) != -1)
					return "中国," + spe;
			}
			if (country.indexOf("省") != -1) {
				String contrysplit[] = country.split("省");
				return "中国," + contrysplit[0] + "省";
			} else if (country.indexOf("市") != -1) {
				String citysplist[] = country.split("市");
				return "中国," + citysplist[0] + "市";
			} else if (umap.containsKey(country)) {
				return "中国," + umap.get(country);
			} else if (country.indexOf("中国") != -1) {
				return "中国," + "";
			} else {
				return country + "," + "";
			}
		}

		public Map<String, String> getUniversMap(String filepath)
				throws FileNotFoundException {
			Map<String, String> universMap = new HashMap<String, String>();
			FileReader fr = new FileReader(filepath);
			BufferedReader br = new BufferedReader(fr);
			String readoneline;
			String tmp[];
			try {
				while ((readoneline = br.readLine()) != null) {
					tmp = readoneline.split(",");
					if(tmp.length == 3){
						universMap.put(tmp[0], tmp[2]);
					}
				}
			} catch (IOException e) {
				e.printStackTrace();
			}
			return universMap;
		}
	}

	public static class ItemReducer extends
			Reducer<Text, NullWritable, Text, NullWritable> {

		protected void reduce(Text key, Iterable<NullWritable> values,
				Context context) throws IOException, InterruptedException {
			context.write(key, NullWritable.get());
		}

	}

	public static void main(String[] args) throws Exception {
		Configuration conf = new Configuration();
		String[] otherArgs = new GenericOptionsParser(conf, args)
				.getRemainingArgs();
		if (otherArgs.length != 2) {
			System.err.println("Usage: -file <file> <in> <out>");
			System.exit(2);
		}

		Job job = new Job(conf, "ipcount");
		job.setJarByClass(ConvertIp.class);
		job.setMapperClass(ItemMapper.class);
		job.setReducerClass(ItemReducer.class);
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(NullWritable.class);
		FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
		FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
		System.exit(job.waitForCompletion(true) ? 0 : 1);
	}
}

7. Package it as a jar, upload it to the server, and test.

   

hadoop jar /home/wjk/ipcount.jar com.wjk.datastat.ip.ConvertIp -files /home/wjk/qqwry.dat,/home/wjk/dirtydata.txt  /user/hive/warehouse/active_log/dt=20120301  /user/wjk/output/datastat
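
After the job finishes, the appended country/province fields can be spot-checked straight from HDFS (assuming the default single reducer, so the output lands in part-r-00000):

hadoop fs -cat /user/wjk/output/datastat/part-r-00000 | head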