python语言文本识别 python文本检测

转载

bingfeng 2024-06-25 04:23:03

文章标签 python语言文本识别 Java ITeye .net 浏览器 文章分类 Python 后端开发

自动检测文本的字符集
（Java port of Mozilla charset detector 工具的使用方法）

Java port of Mozilla charset detector是Mozilla组织开发的用于浏览器自动识别网页字符集的工具的一个JAVA实现。Mozilla有关该工具的地址是：http://www.mozilla.org/projects/intl/chardet.html 。
本JAVA实现版本的地址是：http://sourceforge.net/projects/jchardet/ 。
在JAVA系统开发中，经常遇到字符集的问题（其实不是JAVA的问题，而是软件行业的历史遗留问题）。有时候不知道文本内容的编码方式，处理时就会出现一些意想不到的问题。通过使用此工具，虽不能百分百准确地检测字符集，但还可将就用用。此工具检测效率不是很高，使用后对系统性能会有一点影响，对性能要求很高的系统请谨慎使用。

下面是如何使用此工具的一个简单说明：

第一步：
让需要检测字符集的类实现接口nsICharsetDetectionObserver，并实现其中的方法：Notify()。该方法会在检测结束的时候被调用，检测到的字符集（不论对错）会作为参数传入。

接口代码如下：

package org.mozilla.intl.chardet ;

import java.lang.* ;

/**
 * Callback contract for code that wants to be told which character set
 * a detection run came up with.
 */
public interface nsICharsetDetectionObserver {

  /**
   * Receives the name of the character set reported by the detector.
   *
   * @param charset name of the reported character set
   */
  void Notify(String charset);
}

第二步：
初始化nsDetector类，然后对字符串流进行检测，如果检测到有非ASCII的字符，则调用nsDetector的DoIt()方法进行检测。

第三步：
当字符串流处理结束后，调用nsDetector的DataEnd()方法，检测引擎会向调用者返回检测到的字符集。下面是一个调用该工具检测指定URL的页面编码方式的例子：

package org.mozilla.intl.chardet ;
import java.io.* ;
import java.net.* ;
import java.util.* ;
import org.mozilla.intl.chardet.* ;

/**
 * Command-line sample that feeds the bytes behind a URL to an
 * {@code nsDetector} and prints the character set it reports.
 *
 * <p>Usage: {@code HtmlCharsetDetector <url> [<languageHint>]}.
 */
public class HtmlCharsetDetector {

    /**
     * True once a charset has been printed, either by the observer callback
     * or by the pure-ASCII shortcut at the end of {@link #main(String[])}.
     */
    public static boolean found = false ;

    /**
     * Reads the resource at {@code argv[0]}, runs charset detection on it and
     * prints the result to standard output. When no charset was reported, the
     * detector's probable charsets are printed instead.
     *
     * @param argv {@code argv[0]} is the URL to read; the optional
     *             {@code argv[1]} is an integer language hint
     * @throws Exception if the hint is not an integer, the URL is malformed,
     *                   or the stream cannot be opened or read
     */
    public static void main(String argv[]) throws Exception {

        if (argv.length != 1 && argv.length != 2) {
            printUsage();
            return ;
        }

        // Start from a clean state so that a previous invocation in the same
        // JVM cannot suppress the "Probable Charset" fallback below.
        found = false ;

        // Initialise the detector with the language hint (or ALL when absent).
        int lang = (argv.length == 2) ? Integer.parseInt(argv[1]) : nsPSMDetector.ALL ;
        nsDetector det = new nsDetector(lang) ;

        // Register the observer; Notify() is called if a charset is detected.
        det.Init(new nsICharsetDetectionObserver() {
            public void Notify(String charset) {
                HtmlCharsetDetector.found = true ;
                System.out.println("CHARSET = " + charset);
            }
        });

        URL url = new URL(argv[0]);
        boolean isAscii = true ;

        BufferedInputStream imp = new BufferedInputStream(url.openStream());
        try {
            byte[] buf = new byte[1024] ;
            int len;
            boolean done = false ;

            // Once DoIt() has reported completion neither branch below can run
            // again (isAscii is already false), so stop reading at that point.
            while (!done && (len = imp.read(buf, 0, buf.length)) != -1) {

                // Keep checking whether the stream still contains only ASCII.
                if (isAscii) {
                    isAscii = det.isAscii(buf, len);
                }

                // As soon as non-ASCII bytes have been seen, feed the detector.
                if (!isAscii) {
                    done = det.DoIt(buf, len, false);
                }
            }
        } finally {
            // Always release the underlying connection, even when read() fails.
            imp.close();
        }
        det.DataEnd();

        if (isAscii) {
            System.out.println("CHARSET = ASCII");
            found = true ;
        }

        // Nothing was reported: fall back to the detector's candidate list.
        if (!found) {
            String prob[] = det.getProbableCharsets() ;
            for (int i = 0; i < prob.length; i++) {
                System.out.println("Probable Charset = " + prob[i]);
            }
        }
    }

    /** Prints the command-line usage text to standard output. */
    private static void printUsage() {
        System.out.println("Usage: HtmlCharsetDetector <url> [<languageHint>]");
        System.out.println("");
        System.out.println("Where <url> is http://...");
        System.out.println("For optional <languageHint>. Use following...");
        System.out.println("		1 => Japanese");
        System.out.println("		2 => Chinese");
        System.out.println("		3 => Simplified Chinese");
        System.out.println("		4 => Traditional Chinese");
        System.out.println("		5 => Korean");
        System.out.println("		6 => Dont know (default)");
    }
}

下面是检测文本文件字符编码的例子，跟检测URL的差不多，只改了输入流获取方式：

package com.iteye.sheng.util.tools;

import java.io.* ;
import java.net.* ;
import java.util.* ;
import org.mozilla.intl.chardet.* ;

/**
 * Command-line sample that feeds the bytes of a local file to an
 * {@code nsDetector} and prints the character set it reports.
 *
 * <p>Usage: {@code FileCharsetDetector <file> [<languageHint>]}.
 */
public class FileCharsetDetector {

  /**
   * True once a charset has been printed, either by the observer callback
   * or by the pure-ASCII shortcut at the end of {@link #main(String[])}.
   */
  public static boolean found = false ;

  /**
   * Reads the file named by {@code argv[0]}, runs charset detection on it and
   * prints the result to standard output. When no charset was reported, the
   * detector's probable charsets are printed instead.
   *
   * @param argv {@code argv[0]} is the path of the file to read; the optional
   *             {@code argv[1]} is an integer language hint
   * @throws Exception if the hint is not an integer, or the file cannot be
   *                   opened or read
   */
  public static void main(String argv[]) throws Exception {

    if (argv.length != 1 && argv.length != 2) {
      printUsage();
      return ;
    }

    // Start from a clean state so that a previous invocation in the same
    // JVM cannot suppress the "Probable Charset" fallback below.
    found = false ;

    // Initialise the detector with the language hint (or ALL when absent).
    int lang = (argv.length == 2) ? Integer.parseInt(argv[1]) : nsPSMDetector.ALL ;
    nsDetector det = new nsDetector(lang) ;

    // Register the observer; Notify() is called if a charset is detected.
    det.Init(new nsICharsetDetectionObserver() {
      public void Notify(String charset) {
        FileCharsetDetector.found = true ;
        System.out.println("CHARSET = " + charset);
      }
    });

    String filename = argv[0];
    boolean isAscii = true ;

    BufferedInputStream imp = new BufferedInputStream(new FileInputStream(filename));
    try {
      byte[] buf = new byte[1024] ;
      int len;
      boolean done = false ;

      // Once DoIt() has reported completion neither branch below can run
      // again (isAscii is already false), so stop reading at that point.
      while (!done && (len = imp.read(buf, 0, buf.length)) != -1) {

        // Keep checking whether the stream still contains only ASCII.
        if (isAscii) {
          isAscii = det.isAscii(buf, len);
        }

        // As soon as non-ASCII bytes have been seen, feed the detector.
        if (!isAscii) {
          done = det.DoIt(buf, len, false);
        }
      }
    } finally {
      // Always release the file handle, even when read() fails.
      imp.close();
    }
    det.DataEnd();

    if (isAscii) {
      System.out.println("CHARSET = ASCII");
      found = true ;
    }

    // Nothing was reported: fall back to the detector's candidate list.
    if (!found) {
      String prob[] = det.getProbableCharsets() ;
      for (int i = 0; i < prob.length; i++) {
        System.out.println("Probable Charset = " + prob[i]);
      }
    }
  }

  /** Prints the command-line usage text to standard output. */
  private static void printUsage() {
    System.out.println("Usage: FileCharsetDetector <file> [<languageHint>]");
    System.out.println("");
    System.out.println("Where <file> is C:/xxxxx.txt");
    System.out.println("For optional <languageHint>. Use following...");
    System.out.println("        1 => Japanese");
    System.out.println("        2 => Chinese");
    System.out.println("        3 => Simplified Chinese");
    System.out.println("        4 => Traditional Chinese");
    System.out.println("        5 => Korean");
    System.out.println("        6 => Dont know (default)");
  }
}

本文章为转载内容，我们尊重原作者对文章享有的著作权。如有内容错误或侵权问题，欢迎原作者联系我们进行内容更正或删除文章。