Writing the file
Requirement: write 100 million lines, each a random number of at most 7 digits.

First, the result screenshots, as proof I'm not making this up!

This is the final generated file, just over 770 MB. Below it is opened in glogg for a preview:

The program prints the elapsed times:

7149 ms + 923 ms = 8072 ms, i.e. about 8 seconds to write 100 million lines to a file! (The parameters could still be tuned further.)

Approach
Use NIO to write efficiently: first write 20 small files, one thread per file, then merge them at the end.

Code:

import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.RandomAccessFile;
import java.nio.MappedByteBuffer;
import java.nio.channels.FileChannel;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.ThreadLocalRandom;
import java.util.concurrent.atomic.AtomicInteger;

public static void main(String[] args) throws InterruptedException {
    int totals = 100000000; // 100 million lines in total
    int segment = 20;       // number of small files, one thread each
    ExecutorService service = Executors.newFixedThreadPool(segment);
    AtomicInteger incr = new AtomicInteger(0);
    CountDownLatch downLatch = new CountDownLatch(segment);
    long s = System.currentTimeMillis();
    for (int j = 0; j < segment; j++) {
        service.execute(() -> {
            RandomAccessFile acf;
            FileChannel fc = null;
            try {
                String fName = "E:\\tmp_" + incr.getAndIncrement() + ".txt";
                acf = new RandomAccessFile(fName, "rw");
                fc = acf.getChannel();
                int offset = 0;
                // each thread writes totals/segment = 5,000,000 lines, in 500 batches
                for (int i = 0; i < totals / segment / 10000; i++) {
                    // build a batch of 10,000 numbers in memory first
                    StringBuilder sb = new StringBuilder();
                    for (int k = 0; k < 10000; k++) {
                        // ThreadLocalRandom avoids allocating a new Random per number
                        sb.append(ThreadLocalRandom.current().nextInt(10000000)).append("\n");
                    }
                    byte[] bs = sb.toString().getBytes();
                    // map the next region of the file and write the whole batch at once
                    MappedByteBuffer mbuf = fc.map(FileChannel.MapMode.READ_WRITE, offset, bs.length);
                    mbuf.put(bs);
                    offset = offset + bs.length;
                }
            } catch (Exception e) {
                e.printStackTrace();
            } finally {
                downLatch.countDown();
                try {
                    if (fc != null) {
                        fc.close();
                    }
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        });
    }
    downLatch.await();
    System.out.println("await returned, small files written! elapsed: " + (System.currentTimeMillis() - s));
    List<File> files = new ArrayList<File>();
    for (int i = 0; i < segment; i++) {
        files.add(new File("E:\\tmp_" + i + ".txt"));
    }
    s = System.currentTimeMillis();
    // merge the small files into one
    merge(files, "E:\\last.txt");
    System.out.println("merge finished! elapsed: " + (System.currentTimeMillis() - s));
    service.shutdown();
}

public static void merge(List<File> files, String to) {
    File t = new File(to);
    FileInputStream in = null;
    FileChannel inChannel = null;
    FileOutputStream out = null;
    FileChannel outChannel = null;
    try {
        out = new FileOutputStream(t, true);
        outChannel = out.getChannel();
        // position in the merged file where the next chunk is written
        long start = 0;
        for (File file : files) {
            in = new FileInputStream(file);
            inChannel = in.getChannel();
            // pull file.length() bytes from inChannel into outChannel at position start
            outChannel.transferFrom(inChannel, start, file.length());
            start += file.length();
            inChannel.close();
            in.close();
        }
    } catch (Exception e) {
        e.printStackTrace();
    } finally {
        try {
            if (outChannel != null) {
                outChannel.close();
            }
            if (out != null) {
                out.close();
            }
        } catch (Exception e2) {
        }
    }
}
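One caveat, added here as an editorial note rather than something from the original post: FileChannel.transferFrom is not guaranteed to copy all count bytes in a single call, so a defensive version of the per-file copy inside merge loops until each source file has been fully transferred. A minimal sketch of what that loop could look like in place of the single transferFrom call:

// Defensive replacement for the single transferFrom call inside merge:
// transferFrom may copy fewer bytes than requested, so loop until done.
long pos = start;
long remaining = file.length();
while (remaining > 0) {
    long n = outChannel.transferFrom(inChannel, pos, remaining);
    if (n <= 0) {
        break; // nothing moved; bail out instead of spinning
    }
    pos += n;
    remaining -= n;
}
start = pos;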


Reading the file
First, the result screenshot:

Reading 100,000,000 lines took 18341 ms, about 18 seconds. And because each file is streamed line by line, it will not OOM no matter how large the source file is.

Approach

First split the large source file into small files.
Then read the small files line by line, in batches, with multiple threads.
Code: (this approach is not a good fit, it is far too slow! A much more efficient method, taking only about 700 ms, comes later.)

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.PrintWriter;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.atomic.AtomicInteger;

public static void main(String[] args) throws InterruptedException {
    int totals = 100000000;
    int segment = 20;
    ExecutorService service = Executors.newFixedThreadPool(segment);
    // split the big file into segment pieces first
    long s = System.currentTimeMillis();
    splitFileByLine("E:\\last.txt", "E:\\", totals / segment);
    System.out.println("splitting took: " + (System.currentTimeMillis() - s));
    AtomicInteger incr = new AtomicInteger(1);
    AtomicInteger total = new AtomicInteger(0);
    CountDownLatch downLatch = new CountDownLatch(segment);
    for (int i = 1; i <= segment; i++) {
        service.execute(() -> {
            try {
                readFile("E:\\last-" + incr.getAndIncrement() + ".txt", line -> {
                    // do NOT print each line here, printing is far too slow
                    total.getAndIncrement();
                });
            } finally {
                downLatch.countDown();
            }
        });
    }
    downLatch.await();
    System.out.println("total elapsed: " + (System.currentTimeMillis() - s) + " , lines read: " + total.get());
    service.shutdown();
}

/**
 * Split a file by lines.
 *
 * @param sourceFilePath      path of the source file
 * @param targetDirectoryPath target directory for the split files
 * @param rows                number of lines per split file
 */
public static int splitFileByLine(String sourceFilePath, String targetDirectoryPath, int rows) {
    String sourceFileName = sourceFilePath.substring(sourceFilePath.lastIndexOf(File.separator) + 1,
            sourceFilePath.lastIndexOf(".")); // source file name without extension
    String splitFileName = targetDirectoryPath + File.separator + sourceFileName + "-%s.txt"; // name pattern for the split files
    File targetDirectory = new File(targetDirectoryPath);
    if (!targetDirectory.exists()) {
        targetDirectory.mkdirs();
    }
    PrintWriter pw = null; // character output stream for the current split file
    String tempLine;
    int lineNum = 0; // lines written so far; roll over to a new file every rows lines
    int splitFileIndex = 1; // index of the current split file
    try (BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(sourceFilePath)))) {
        pw = new PrintWriter(String.format(splitFileName, splitFileIndex));
        while ((tempLine = br.readLine()) != null) {
            if (lineNum > 0 && lineNum % rows == 0) { // time to start a new file
                pw.flush();
                pw.close();
                pw = new PrintWriter(String.format(splitFileName, ++splitFileIndex));
            }
            pw.write(tempLine + "\n");
            lineNum++;
        }
        return splitFileIndex;
    } catch (Exception e) {
        e.printStackTrace();
        return -1;
    } finally {
        if (null != pw) {
            pw.flush();
            pw.close();
        }
    }
}

public interface Callback {
    public void onReceive(String line);
}

/**
 * Read a large file line by line.
 *
 * @param filePath path of the file to read
 * @param callback invoked once for every line
 */
public static void readFile(String filePath, Callback callback) {
    File file = new File(filePath);
    BufferedReader reader = null;
    try {
        // use a large buffer (20 MB) when reading big files to reduce syscalls
        reader = new BufferedReader(new FileReader(file), 20 * 1024 * 1024);
        String tempString = null;
        while ((tempString = reader.readLine()) != null) {
            callback.onReceive(tempString);
        }
    } catch (IOException e) {
        e.printStackTrace();
    } finally {
        if (reader != null) {
            try {
                reader.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
}
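As an aside (my addition, not part of the original measurements): on Java 8+ the same no-OOM property is available with Files.lines, which streams the file lazily. A minimal sketch, assuming one of the split files produced above:

import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.stream.Stream;

public static void main(String[] args) throws IOException {
    // Files.lines streams lazily, so memory stays flat however large the file is
    try (Stream<String> lines = Files.lines(Paths.get("E:\\last-1.txt"))) {
        System.out.println("lines: " + lines.count());
    }
}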


Where the read can be optimized
The splitting step can be switched to an NIO-based approach. Here is my timing screenshot for reading 100 million lines that way:

Splitting into 20 files took 743 ms, and split plus read together took 3252 ms. A qualitative leap! A sketch of such an NIO split follows below.

Screenshot of the split files:
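The post does not include the NIO splitting code itself, so here is a minimal sketch of the idea under my own assumptions: cut the file into roughly equal byte ranges, nudge each boundary forward past the next '\n' so no line is torn in half, and let FileChannel.transferTo do the zero-copy work. The method name splitByChannel and the boundary probing are illustrative, not the author's code:

import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.channels.FileChannel;
import java.nio.file.Paths;
import java.nio.file.StandardOpenOption;

// Sketch of NIO-based splitting: boundaries are aligned to the next '\n'
// so every line lands whole in exactly one part file.
public static void splitByChannel(String source, String targetPattern, int parts) throws IOException {
    try (FileChannel in = FileChannel.open(Paths.get(source), StandardOpenOption.READ)) {
        long size = in.size();
        long chunk = size / parts;
        long start = 0;
        ByteBuffer probe = ByteBuffer.allocate(1);
        for (int i = 1; i <= parts && start < size; i++) {
            // nominal end of this part; the last part takes whatever is left
            long end = (i == parts) ? size : Math.min(start + chunk, size);
            // advance end past the next newline so we never cut a line in half
            while (end < size) {
                probe.clear();
                in.read(probe, end);
                end++;
                if (probe.get(0) == '\n') {
                    break;
                }
            }
            try (FileChannel out = FileChannel.open(Paths.get(String.format(targetPattern, i)),
                    StandardOpenOption.CREATE, StandardOpenOption.WRITE, StandardOpenOption.TRUNCATE_EXISTING)) {
                long pos = start;
                long remaining = end - start;
                // transferTo may move fewer bytes than asked for, so loop
                while (remaining > 0) {
                    long n = in.transferTo(pos, remaining, out);
                    if (n <= 0) {
                        break;
                    }
                    pos += n;
                    remaining -= n;
                }
            }
            start = end;
        }
    }
}

Called as, say, splitByChannel("E:\\last.txt", "E:\\last-%s.txt", 20), it produces the same file names the reading code above expects.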