1 2 3 Previous Next 31 Replies Latest reply: Apr 27, 2010 4:51 PM by 843810 Go to original post RSS
      • 30. Re: How to detect what character encoding a file is in
        843810
        Here is an implementation that uses Mozilla's jchardet encoding detector:

        1 - Create a class that implements jchardet's nsICharsetDetectionObserver interface:

        package com.yourcompany.helpers;

        import org.apache.log4j.Logger;
        import org.mozilla.intl.chardet.nsICharsetDetectionObserver;

        public class CSVOutputCharsetObserver implements nsICharsetDetectionObserver {

             
             static int READY=2;
             static int RUNNING=1;
             static int IDLE=0;
             
             private int status = IDLE;
             private String charsetToUse = "";
             
             static Logger logger = Logger.getLogger(CSVOutputCharsetObserver.class);
             public void Notify(String charset) {
             
                  logger.debug(" >> charset detected --> " + charset);
                  
                  charsetToUse = charset;
        status = READY;
             }

             public String getCharsetToUse() {
                  return charsetToUse;
             }

             public void setCharsetToUse(String charsetToUse) {
                  this.charsetToUse = charsetToUse;
             }

             public int getStatus() {
                  return status;
             }

             public void setStatus(int status) {
                  this.status = status;
             }

        }


        2 - Create a helper class that acts as the jchardet client:

        package com.yourcompany.helpers;

        import java.io.BufferedInputStream;
        import java.io.ByteArrayInputStream;
        import java.io.FileInputStream;
        import java.io.FileNotFoundException;
        import java.io.IOException;
        import java.io.UnsupportedEncodingException;


        import org.apache.log4j.Logger;
        import org.mozilla.intl.chardet.nsDetector;
        import org.mozilla.intl.chardet.nsPSMDetector;

        public class CSVOutputHelper {

             Logger logger = Logger.getLogger(CSVOutputHelper.class);
             public String detectCharsetEncodingForTheGivenString(String fileName) {
                  
             FileInputStream f = null;
                  try {
                       f = new FileInputStream(fileName);
                  } catch (FileNotFoundException e1) {
                       // TODO Auto-generated catch block
                       e1.printStackTrace();
                  }
                  BufferedInputStream buff = new BufferedInputStream(f);
                  
                  int lang = nsPSMDetector.ALL; //or nsPSMDetector.JAPANESE and so. See Jchardet API for more information
                  
                  nsDetector det = new nsDetector(lang);
                  CSVOutputCharsetObserver obsvr = new CSVOutputCharsetObserver();
                  det.Init(obsvr);
                  
                  byte[] buf = new byte[1024] ; // reads a chunk of 1024 bytes at a time
        int len;
        boolean done = false ;
        boolean isAscii = true ;

        try {
                       while( (len=buff.read(buf,0,buf.length)) != -1) {

                       // Check if the stream is only ascii.
                       if (isAscii)
                       isAscii = det.isAscii(buf,len);
                       
                       // DoIt if non-ascii and not done yet.
        // Here I've removed the && !done
        //because I wanted to see all charsets encodings
        //that Jchardet detects in a file

                       if (!isAscii)
                       done = det.DoIt(buf,len, false);
                       
                       
                       }
                  } catch (IOException e) {
                       // TODO Auto-generated catch block
                       e.printStackTrace();
                  obsvr.setCharsetToUse("UTF-8");
                  } finally {
                       det.DataEnd();
                  }

        if (isAscii){
             obsvr.setCharsetToUse("ASCII");
        }
                  
                  return obsvr.getCharsetToUse();
             }
             
             
             
        }


        Hope that helps.
        • 31. Re: How to detect what character encoding a file is in
          843810
          I'm going to add this solution here for the benefit of anyone who comes across this thread and needs help:
          In short, you can properly read in a stream or bytes using the ICU4J CharsetDetector helper class:
          [http://bit.ly/byGArx|http://bit.ly/byGArx]
          1 2 3 Previous Next