今天帮同学处理数据, 主要是从1w多条记录中随机抽取8k条, 然后再从这8k条记录中随机抽取2k条. 最后将这2k条记录随机分成10组, 使得各组之间的记录互不重复.
下面将我的代码都贴上来, 好以后处理csv文件.
1.首先使用第三方的jar文件 javacsv.jar : 链接: http://pan.baidu.com/s/1qW5b3u0 密码: qjmx
package spt.csv;
import java.io.Serializable;
import java.nio.charset.Charset;
import spt.util.PropertyConfig;
/**
 * Common base for the CSV helper classes: holds the file name, the column
 * delimiter and the character encoding of a CSV file.
 */
public abstract class CSVBasic implements Serializable {

    private static final long serialVersionUID = 7916808982930771124L;

    /** Character encoding of the file. */
    private Charset charset;

    /** Column delimiter. */
    private char delimiter;

    /** Path of the CSV file. */
    private String fileName;

    /** Creates an instance with no file name, delimiter or charset set. */
    public CSVBasic() {
    }

    /**
     * Creates an instance that uses the default delimiter and charset.
     *
     * @param fileName path of the CSV file
     */
    public CSVBasic(String fileName) {
        this(fileName, getDefaultDelimiter(), getDefaultCharset());
    }

    /**
     * Creates a fully specified instance.
     *
     * @param fileName  path of the CSV file
     * @param delimiter column delimiter
     * @param charset   character encoding of the file
     */
    public CSVBasic(String fileName, char delimiter, Charset charset) {
        setFileName(fileName);
        setDelimiter(delimiter);
        setCharset(charset);
    }

    /**
     * Returns the default charset, named by the "charset" configuration
     * property.
     *
     * @return the charset looked up via {@link Charset#forName(String)}
     */
    public static Charset getDefaultCharset() {
        String charsetName = PropertyConfig.getProperty("charset");
        return Charset.forName(charsetName);
    }

    /**
     * Returns the default delimiter: the first character of the "delimiter"
     * configuration property.
     *
     * @return the default delimiter character
     */
    public static char getDefaultDelimiter() {
        String configured = PropertyConfig.getProperty("delimiter");
        return configured.charAt(0);
    }

    /** @return path of the CSV file */
    public String getFileName() {
        return this.fileName;
    }

    /** @param fileName path of the CSV file */
    public void setFileName(String fileName) {
        this.fileName = fileName;
    }

    /** @return character encoding of the file */
    public Charset getCharset() {
        return this.charset;
    }

    /** @param charset character encoding of the file */
    public void setCharset(Charset charset) {
        this.charset = charset;
    }

    /** @return column delimiter */
    public char getDelimiter() {
        return this.delimiter;
    }

    /** @param delimiter column delimiter */
    public void setDelimiter(char delimiter) {
        this.delimiter = delimiter;
    }
}
3.读取csv文件,并映射记录为List<Map<String, String>> 对象:
package spt.csv;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import spt.util.PropertyConfig;
import com.csvreader.CsvReader;
/**
 * Reads a CSV file and exposes its records as a list of maps.
 */
public class Reader extends CSVBasic {

    private static final long serialVersionUID = -1712774594374451546L;

    /** Underlying CSV reader; closed by {@link #getResult(List)}. */
    private CsvReader reader;

    /**
     * Opens the given file with the default delimiter and charset.
     *
     * @param fileName path of the CSV file
     * @throws FileNotFoundException propagated from the {@code CsvReader} constructor
     */
    public Reader(String fileName) throws FileNotFoundException {
        this(fileName, getDefaultDelimiter(), getDefaultCharset());
    }

    /**
     * Opens the given file with an explicit delimiter and charset.
     *
     * @param fileName  path of the CSV file
     * @param delimiter column delimiter
     * @param charset   character encoding of the file
     * @throws FileNotFoundException propagated from the {@code CsvReader} constructor
     */
    public Reader(String fileName, char delimiter, Charset charset)
            throws FileNotFoundException {
        // Store the settings in the base class before creating the reader.
        super(fileName, delimiter, charset);
        setReader(new CsvReader(fileName, delimiter, charset));
    }

    public CsvReader getReader() {
        return reader;
    }

    public void setReader(CsvReader reader) {
        this.reader = reader;
    }

    /**
     * Reads every record of the file and maps it to a {@code Map} keyed by
     * column name.
     *
     * <p>Each entry of {@code fieldNames} is a configuration key; the actual
     * column name is the value {@code PropertyConfig.getProperty} returns for
     * that key. The returned maps are keyed by those resolved column names.
     * The underlying reader is closed before this method returns.
     *
     * @param fieldNames configuration keys of the columns to extract
     * @return one map per record, or {@code null} if an {@code IOException}
     *         occurred while reading
     */
    public List<Map<String, String>> getResult(List<String> fieldNames) {
        List<Map<String, String>> lines = new ArrayList<Map<String, String>>();
        CsvReader r = null;
        try {
            r = getReader();
            r.readHeaders(); // consume the header row

            // Resolve the column names once: PropertyConfig.getProperty reloads
            // the properties file on every call, so doing this per record and
            // per field would re-read the file for every single cell.
            List<String> columns = new ArrayList<String>(fieldNames.size());
            for (String fieldName : fieldNames) {
                columns.add(PropertyConfig.getProperty(fieldName));
            }

            while (r.readRecord()) {
                Map<String, String> itemMap = new HashMap<String, String>();
                for (String column : columns) {
                    itemMap.put(column, r.get(column));
                }
                lines.add(itemMap);
            }
            return lines;
        } catch (IOException e) {
            e.printStackTrace();
            return null;
        } finally {
            if (r != null) {
                r.close();
            }
        }
    }

    /** Returns the file name this reader was created for. */
    @Override
    public String toString() {
        return getFileName();
    }
}
4.将List<Map<String, String>>输出为csv文件的类:
package spt.csv;
import java.io.IOException;
import java.nio.charset.Charset;
import java.util.List;
import java.util.Map;
import com.csvreader.CsvWriter;
/**
 * Writes a list of record maps to a CSV file.
 */
public class Writer extends CSVBasic {

    private static final long serialVersionUID = -9141083858975437622L;

    /** Underlying CSV writer; closed at the end of {@link #write(List, List)}. */
    private CsvWriter writer;

    /**
     * Creates a writer with no file name, using the default delimiter and
     * charset.
     * NOTE(review): this hands a null file name to the CsvWriter constructor;
     * confirm that CsvWriter accepts null.
     */
    public Writer() {
        this(null, getDefaultDelimiter(), getDefaultCharset());
    }

    /**
     * Creates a writer for the given file with the default delimiter and
     * charset.
     *
     * @param fileName path of the output file
     */
    public Writer(String fileName) {
        this(fileName, getDefaultDelimiter(), getDefaultCharset());
    }

    /**
     * Creates a writer for the given file.
     *
     * @param fileName  path of the output file
     * @param delimiter column delimiter
     * @param charset   character encoding of the output
     */
    public Writer(String fileName, char delimiter, Charset charset) {
        super(fileName, delimiter, charset);
        this.writer = new CsvWriter(fileName, delimiter, charset);
    }

    public CsvWriter getWriter() {
        return this.writer;
    }

    public void setWriter(CsvWriter writer) {
        this.writer = writer;
    }

    /**
     * Writes a header row followed by one row per map, then closes the
     * underlying writer.
     *
     * <p>The header row consists of {@code fieldNames}; each cell of a data
     * row is the value the record map holds under the corresponding field
     * name.
     * NOTE(review): values are looked up by the raw field names, whereas
     * {@code Reader.getResult} keys its maps by the property values of those
     * names; this presumably relies on key and value being identical in the
     * configuration. Verify.
     *
     * @param fieldNames column names, in output order
     * @param mapList    records to write
     * @return {@code true} on success, {@code false} if an {@code IOException}
     *         occurred
     */
    public boolean write(List<String> fieldNames,
            List<Map<String, String>> mapList) {
        CsvWriter out = null;
        try {
            out = getWriter();

            // Header row.
            final int columnCount = fieldNames.size();
            out.writeRecord(fieldNames.toArray(new String[columnCount]));

            // Data rows.
            for (Map<String, String> row : mapList) {
                String[] cells = new String[fieldNames.size()];
                int column = 0;
                while (column < fieldNames.size()) {
                    cells[column] = row.get(fieldNames.get(column));
                    column++;
                }
                out.writeRecord(cells);
            }
            return true;
        } catch (IOException e) {
            e.printStackTrace();
            return false;
        } finally {
            if (out != null) {
                out.close();
            }
        }
    }
}
5.表中有一个字段NYR, 表示时间, 由于需要将结果按照时间的先后顺序排序, 所以定义一个比较器:
package spt.csv;
import java.text.ParseException;
import java.util.Comparator;
import java.util.Map;
import spt.util.DateService;
import spt.util.PropertyConfig;
/**
 * Orders records (each a {@code Map}) chronologically by their date column.
 *
 * <p>The date column name is the value of the "NYR" configuration property,
 * resolved once when the comparator is created. Records whose date is missing,
 * empty or unparseable sort before all records with a valid date, and compare
 * as equal to one another, so the ordering stays consistent for sorting.
 */
public class RecordDateComparator implements Comparator<Map<String, String>> {

    /** Name of the date column, resolved once instead of on every comparison. */
    private final String dateKey = PropertyConfig.getProperty("NYR");

    /**
     * Compares two records by their date column.
     *
     * @param m1 first record
     * @param m2 second record
     * @return -1, 0 or 1 as the first record's date is earlier than, equal to
     *         or later than the second's
     */
    @Override
    public int compare(Map<String, String> m1, Map<String, String> m2) {
        java.util.Date first = parseOrNull(m1.get(dateKey));
        java.util.Date second = parseOrNull(m2.get(dateKey));

        // Missing dates sort first; two missing dates are equal.
        if (first == null || second == null) {
            if (first == null && second == null) {
                return 0;
            }
            return first == null ? -1 : 1;
        }

        // Compare the long timestamps directly; a subtraction narrowed to int
        // could overflow.
        long t1 = first.getTime();
        long t2 = second.getTime();
        if (t1 < t2) {
            return -1;
        }
        if (t1 > t2) {
            return 1;
        }
        return 0;
    }

    /** Parses a date string, returning null when it is empty or malformed. */
    private static java.util.Date parseOrNull(String raw) {
        try {
            return DateService.getDate(raw);
        } catch (ParseException e) {
            e.printStackTrace();
            return null;
        }
    }
}
6.在main类中:
package spt.csv;
import java.io.File;
import java.io.FileNotFoundException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.Random;
import spt.util.PropertyConfig;
/**
 * Demo entry point: draws a random sample from the records of a CSV file,
 * draws a smaller random sample from that, and splits the final sample into
 * groups that are written to separate CSV files.
 *
 * <p>The sample sizes, group count and input file come from the
 * "first_size", "second_size", "groupCount" and "input_file" configuration
 * properties.
 */
public class ReadWriteDemo {

    /**
     * Runs the two-stage sampling and writes one date-sorted CSV file per
     * group into the current directory.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        System.out.println("执行中...执行过程请不要关闭此窗口!");
        final int first_size = Integer.parseInt(PropertyConfig
                .getProperty("first_size")); // size of the first sample
        final int second_size = Integer.parseInt(PropertyConfig
                .getProperty("second_size")); // size of the second sample
        final int groupCount = Integer.parseInt(PropertyConfig
                .getProperty("groupCount")); // number of groups
        String file = PropertyConfig.getProperty("input_file"); // input path
        try {
            List<String> fieldNames = initFields();
            Reader csv = new Reader(file);

            // All records; getResult returns null when reading failed.
            List<Map<String, String>> totalList = csv.getResult(fieldNames);
            if (totalList == null) {
                System.out.println("读取源文件失败!");
                return;
            }

            List<Map<String, String>> firstTaken = random(totalList, first_size);
            List<Map<String, String>> secondTaken = random(firstTaken,
                    second_size);

            // random() returns its sample in shuffled order, so consecutive
            // slices form a random partition. Slicing by position keeps the
            // groups disjoint even when two records have identical content,
            // which removeAll (equals-based) would not.
            final int groupSize = second_size / groupCount;
            for (int i = 0; i < groupCount; i++) {
                List<Map<String, String>> group = slice(secondTaken,
                        i * groupSize, groupSize);
                String outputFile = nextFreeFileName();
                Collections.sort(group, new RecordDateComparator());
                Writer writer = new Writer(outputFile);
                if (!writer.write(fieldNames, group)) {
                    System.out.println("写入文件失败: " + outputFile);
                }
            }
            System.out.println("done!");
        } catch (FileNotFoundException e) {
            System.out.println("请指定正确的文件路径!");
            e.printStackTrace();
        }
    }

    /**
     * Returns a random sample of the given list, drawn without replacement.
     *
     * <p>Every position of {@code originalList} is eligible. The input list is
     * not modified, and the returned list is a new, mutable list whose
     * elements are in random order.
     *
     * @param originalList list to sample from
     * @param new_size     number of elements to draw
     * @return a new list holding {@code new_size} elements of {@code originalList}
     * @throws IndexOutOfBoundsException if {@code new_size} is not positive or
     *         exceeds the size of {@code originalList}
     */
    public static List<Map<String, String>> random(
            List<Map<String, String>> originalList, int new_size) {
        if (new_size <= 0 || new_size > originalList.size())
            throw new IndexOutOfBoundsException("新列表的长度错误!");
        // Shuffle a copy and keep its prefix: uniform over the whole input
        // and free of retry loops.
        List<Map<String, String>> shuffled = new ArrayList<Map<String, String>>(
                originalList);
        Collections.shuffle(shuffled, new Random());
        return new ArrayList<Map<String, String>>(shuffled.subList(0, new_size));
    }

    /** Copies {@code size} consecutive elements of {@code source}, starting at {@code from}. */
    private static List<Map<String, String>> slice(
            List<Map<String, String>> source, int from, int size) {
        if (size <= 0 || from + size > source.size())
            throw new IndexOutOfBoundsException("新列表的长度错误!");
        return new ArrayList<Map<String, String>>(source.subList(from, from
                + size));
    }

    /** Returns the first name of the form "resultN.csv" (N = 0, 1, ...) that does not exist yet. */
    private static String nextFreeFileName() {
        String candidate = null;
        int fileCount = 0;
        do {
            candidate = "result" + fileCount++ + ".csv";
        } while (new File(candidate).exists());
        return candidate;
    }

    /** Returns the names of all fields handled by the demo, in output order. */
    private static List<String> initFields() {
        List<String> fieldNames = new ArrayList<String>(14);
        fieldNames.add("id");
        fieldNames.add("AJMC");
        fieldNames.add("JYAQ");
        fieldNames.add("AJLB");
        fieldNames.add("AJFAB");
        fieldNames.add("AJZT");
        fieldNames.add("BASJ");
        fieldNames.add("FXSJ");
        fieldNames.add("FASJSX");
        fieldNames.add("FASJXX");
        fieldNames.add("AJBH");
        fieldNames.add("ZBX");
        fieldNames.add("ZBY");
        fieldNames.add("NYR");
        return fieldNames;
    }
}
7.用到的自定义工具类为:
package spt.util;
import java.text.DateFormat;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Calendar;
import java.util.Date;
/**
 * 2015-2-27: utility class for date conversion.
 *
 * <p>Dates are parsed and formatted with the pattern given by the
 * "date_format" configuration property.
 */
public class DateService {

    /**
     * Single shared formatter, created once instead of once per call.
     * SimpleDateFormat is not thread-safe, so every use is synchronized on
     * the instance.
     */
    private static final DateFormat FORMATTER = new SimpleDateFormat(
            PropertyConfig.getProperty("date_format"));

    /**
     * Parses a date string into a {@code Date}.
     *
     * @param strDate text to parse
     * @return the parsed date, or {@code null} if {@code strDate} is null or empty
     * @throws ParseException if the text does not match the configured pattern
     */
    public static Date getDate(String strDate) throws ParseException {
        if (Str.isEmpty(strDate))
            return null;
        synchronized (FORMATTER) {
            return FORMATTER.parse(strDate);
        }
    }

    /**
     * Converts a {@code java.util.Date} to a {@code java.sql.Date}, e.g. for
     * {@code PreparedStatement.setDate}.
     *
     * @param utilDate date to convert
     * @return the converted date, or {@code null} if {@code utilDate} is null
     */
    public static java.sql.Date getSQLDate(java.util.Date utilDate) {
        if (utilDate == null)
            return null;
        return new java.sql.Date(utilDate.getTime());
    }

    /**
     * Formats a date with the configured pattern.
     *
     * @param date date to format
     * @return the formatted text, or {@code null} if {@code date} is null
     */
    public static String getDateStr(java.util.Date date) {
        if (date == null)
            return null;
        synchronized (FORMATTER) {
            return FORMATTER.format(date);
        }
    }

    /**
     * Tells whether a date falls on the calendar day that is {@code disDay}
     * days away from today (0: today, 1: tomorrow, -1: yesterday).
     *
     * @param anotherDate date to test
     * @param disDay      offset in days from today
     * @return {@code true} if year, month and day of month match;
     *         {@code false} otherwise or if {@code anotherDate} is null
     */
    public static boolean isSpecifiedDay(Date anotherDate, int disDay) {
        if (anotherDate == null)
            return false;
        Calendar cNow = Calendar.getInstance();
        cNow.setTime(new Date()); // always compared against the current time
        cNow.add(Calendar.DAY_OF_MONTH, disDay);
        Calendar cAnotherDate = Calendar.getInstance();
        cAnotherDate.setTime(anotherDate);
        return cNow.get(Calendar.YEAR) == cAnotherDate.get(Calendar.YEAR)
                && cNow.get(Calendar.MONTH) == cAnotherDate.get(Calendar.MONTH)
                && cNow.get(Calendar.DAY_OF_MONTH) == cAnotherDate.get(Calendar.DAY_OF_MONTH);
    }
}
package spt.util;
import java.io.IOException;
import java.net.URL;
import java.util.Properties;
/**
 * 2015-2-27: reads settings from the classpath resource
 * "raw/properties.properties".
 *
 * <p>The resource is loaded anew on every call; nothing is cached.
 */
public class PropertyConfig {

    /**
     * Looks up a single configuration value.
     *
     * @param key property name
     * @return the value, or {@code null} if the key is absent or the
     *         configuration could not be loaded
     */
    public static String getProperty(String key) {
        Properties properties = getProperties();
        if (properties == null) {
            return null;
        }
        return properties.getProperty(key);
    }

    /**
     * Loads the configuration resource.
     *
     * @return the loaded properties, or {@code null} if the resource is
     *         missing from the classpath or cannot be read
     */
    public static Properties getProperties() {
        final String configFilePath = "raw/properties.properties";
        URL url = PropertyConfig.class.getClassLoader().getResource(configFilePath);
        if (url == null) {
            // getResource returns null for a missing resource; report it
            // instead of failing with a NullPointerException below.
            System.err.println("Configuration resource not found on classpath: "
                    + configFilePath);
            return null;
        }
        Properties props = new Properties();
        java.io.InputStream in = null;
        try {
            in = url.openStream();
            props.load(in);
            return props;
        } catch (IOException e) {
            e.printStackTrace();
            return null;
        } finally {
            // Always release the stream; it was previously left open.
            if (in != null) {
                try {
                    in.close();
                } catch (IOException ignored) {
                    // Nothing useful can be done if closing fails.
                }
            }
        }
    }
}
package spt.util;
/**
 * String utility class.
 */
public class Str {

    /**
     * Tells whether a string has any content.
     *
     * @param str string to test
     * @return {@code true} if {@code str} is neither null nor of length zero
     */
    public static boolean hasLength(String str) {
        return str != null && str.length() > 0;
    }

    /**
     * Tells whether a string is null or has length zero.
     *
     * @param str string to test
     * @return {@code true} if {@code str} is null or of length zero
     */
    public static boolean isEmpty(String str) {
        if (str == null) {
            return true;
        }
        return str.length() == 0;
    }
}
其中, 配置文件"raw/properties.properties"放置在src目录下(程序通过类加载器从类路径中读取该文件).