Tesseract Ocr 引擎识别图形验证码

原创

catoop 2022-03-23 14:22:43 博主文章分类：Java开发 ©著作权

©著作权归作者所有：来自51CTO博客作者catoop的原创作品，请联系作者获取转载授权，否则将追究法律责任

1、Tesseract介绍 Tesseract 是一个由 Google 支持的开源 OCR 项目，项目地址：https://github.com/tesseract-ocr/tesseract ，最新的源码可以在这里下载。实际使用 Tesseract OCR 有两种方式：1 - 动态库方式 libtesseract；2 - 执行程序方式 tesseract.exe

2、Tesseract安装包下载 Tesseract的release版本下载地址：https://github.com/tesseract-ocr/tesseract/wiki/Downloads

Currently, there is no official Windows installer for newer versions. 意思就是官方不提供最新版windows平台安装包，只有相对略老的3.02.02版本，其下载地址：https://sourceforge.net/projects/tesseract-ocr-alt/files/。最新版3.03和3.05版本，都是三方维护和管理的安装包，有好几个发行机构，分别是：

3rd party Windows exe's/installer binaries compiled by @egorpugin (ref issue #209): https://www.dropbox.com/s/8t54mz39i58qslh/tesseract-3.05.00dev-win32-vc19.zip?dl=1 You have to install VC2015 x86 redist from microsoft.com in order to run them. Leptonica is built with all libs except for libjp2k. Other third-party builds: https://github.com/UB-Mannheim/tesseract/wiki and http://domasofan.spdns.eu/tesseract/

3、Tesseract ocr 的使用 安装之后，默认目录C:\Program Files (x86)\Tesseract-OCR，你需要把这个路径放到你操作系统的path搜索路径中，这样用起来比较方便。在安装目录C:\Program Files (x86)\Tesseract-OCR下可以看到 tesseract.exe这个命令行执行程序。注：安装后的目录，你可以打包成压缩包拷贝到别的地方或别的电脑直接解压使用。

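tesseract语法如下：tesseract imagename outputbase [-l lang] [-psm pagesegmode] [configfile...]。例如：tesseract 1.png output -l eng -psm 7 ，表示采取单行文本方式，使用英语字库识别1.png这个图片文件，识别结果输出到当前目录output.txt文件中。其中 -psm 7 表示用单行文本识别，-l eng 表示使用英语语言。如果使用默认选项（默认语言为 eng，页面分割方式为自动），直接使用 “ tesseract 1.png output ” 即可。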

4、Tesseract ocr 的 Java 工具类

package com.shanhy.unifiedintegral.common.ocr;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.List;

import org.jdesktop.swingx.util.OS;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.stereotype.Component;

/**
 * Recognizes the text of image captchas by running the Tesseract OCR
 * command-line program on an image file.
 *
 * <p>The result is written by tesseract to a fixed file name
 * ({@code output.txt}) next to the image, so concurrent calls on images that
 * live in the same directory would overwrite each other's result.
 *
 * <p>Usage example:
 * <pre>
 *   OCRHelper ocr = new OCRHelper();
 *   ocr.setTessPath("D://Tesseract-OCR");
 *   System.out.println(ocr.recognizeText(new File("G://vcode.jpg")));
 * </pre>
 *
 * @author   单红宇(365384722)
 * @myblog  http://blog.csdn.net/catoop/
 * @create    2016-09-02
 */
@Component
public class OCRHelper {

  /** Command-line switch that selects the recognition language. */
  private static final String LANG_OPTION = "-l";

  /** Base name (without ".txt") of the file tesseract writes its result to. */
  private static final String OUTPUT_BASE = "output";

  /**
   * Tesseract installation directory; defaults to
   * C://Program Files (x86)//Tesseract-OCR when the property is not set.
   */
  @Value("${tesseractDirPath:C://Program Files (x86)//Tesseract-OCR}")
  private String tessPath;

  /**
   * Runs tesseract on the given image and returns the recognized text with
   * all whitespace removed.
   *
   * @param imageFile
   *            the image file to recognize
   * @return the recognized string, stripped of every whitespace character
   * @throws RuntimeException
   *             if tesseract terminates with a non-zero exit code
   * @throws Exception
   *             if the process cannot be started, the result file cannot be
   *             read, or the calling thread is interrupted while waiting
   */
  public String recognizeText(File imageFile) throws Exception {
    // tesseract runs inside the image's directory, so both the image and the
    // output file are passed to it by bare name.
    File workDir = imageFile.getParentFile();
    File resultFile = new File(new File(workDir, OUTPUT_BASE).getAbsolutePath() + ".txt");

    // Equivalent command line: tesseract <image> output -l eng
    List<String> cmd = new ArrayList<String>();
    // On Linux the binary is expected on the PATH; elsewhere it is resolved
    // inside the configured installation directory.
    cmd.add(OS.isLinux() ? "tesseract" : new File(tessPath, "tesseract").getPath());
    cmd.add(imageFile.getName());
    cmd.add(OUTPUT_BASE);
    cmd.add(LANG_OPTION);
    // "eng" ships with the default installation; "chi_sim" (Simplified
    // Chinese) would need to be installed separately.
    cmd.add("eng");

    ProcessBuilder pb = new ProcessBuilder(cmd);
    pb.directory(workDir);
    pb.redirectErrorStream(true);

    try {
      Process process = pb.start();
      // The merged stdout/stderr must be consumed before waiting, otherwise
      // the child can block forever on a full pipe buffer.
      String toolOutput = drainOutput(process);
      int exitCode = process.waitFor();
      if (exitCode != 0) { // by convention 0 means normal termination
        StringBuilder msg = new StringBuilder(describeFailure(exitCode));
        msg.append(" (exit code ").append(exitCode).append(')');
        if (toolOutput.length() > 0) {
          msg.append(": ").append(toolOutput);
        }
        throw new RuntimeException(msg.toString());
      }
      return readResult(resultFile);
    } finally {
      // Remove the temporary result file on success and on failure alike.
      resultFile.delete();
    }
  }

  /** Reads the process's combined output to the end and returns it trimmed. */
  private static String drainOutput(Process process) throws IOException {
    StringBuilder out = new StringBuilder();
    BufferedReader reader = new BufferedReader(new InputStreamReader(process.getInputStream()));
    try {
      String line;
      while ((line = reader.readLine()) != null) {
        out.append(line).append(' ');
      }
    } finally {
      reader.close();
    }
    return out.toString().trim();
  }

  /** Reads the UTF-8 result file and returns its content without any whitespace. */
  private static String readResult(File resultFile) throws IOException {
    StringBuilder text = new StringBuilder();
    BufferedReader in = new BufferedReader(
        new InputStreamReader(new FileInputStream(resultFile), "UTF-8"));
    try {
      String line;
      while ((line = in.readLine()) != null) {
        text.append(line);
      }
    } finally {
      in.close();
    }
    return text.toString().replaceAll("\\s+", "");
  }

  /** Maps a tesseract exit code to a human-readable failure description. */
  private static String describeFailure(int exitCode) {
    switch (exitCode) {
    case 1:
      // Mostly a permission problem: the running Java process lacks
      // sufficient privileges.
      return "Errors accessing files. There may be spaces in your image's filename.";
    case 29:
      return "Cannot recognize the image or its selected region.";
    case 31:
      return "Unsupported image format.";
    default:
      return "Errors occurred.";
    }
  }

  /**
   * Overrides the tesseract installation directory.
   *
   * @param path
   *            directory that contains the tesseract executable
   */
  public void setTessPath(String path) {
    this.tessPath = path;
  }
}