skip to main | skip to sidebar

Java Programs and Examples with Output

Pages

▼
 
  • RSS
  • Twitter
Wednesday, September 19, 2012

Unicode to UTF-8 Conversion Program

Posted by Admin at 8:39 AM – 0 comments
 

This Java program converts the Unicode characters of an input file to the UTF-8 encoding format.


import java.io.*;
import java.util.Date;
import java.text.DateFormat;
import java.text.SimpleDateFormat;

/**
 * Command-line tool that reads a text file line by line and writes every line
 * to an output file as UTF-8 bytes followed by a single line feed (0x0a).
 *
 * <p>The UTF-16 to UTF-8 encoding is done by hand in
 * {@link #UnicodeToUTF8(char[], boolean)} rather than through a charset
 * encoder, so malformed surrogate sequences are reported as
 * {@link CharConversionException} instead of being silently replaced.
 */
public class Unicode2UTFConverter {

    /**
     * Creates a new UTF-8 encoded byte array representing the UTF-16
     * characters passed in.
     *
     * <p>Surrogate pairs are combined into one code point and emitted as a
     * four-byte sequence; all other characters are emitted as one to three
     * bytes. The temporary work buffer is zeroed before this method returns,
     * whether it succeeds or throws, so encoded text does not linger in it.
     *
     * @param unicode an array of UTF-16 code units, possibly containing
     *     surrogate pairs; {@code null} is tolerated and yields {@code null}
     * @param nullTerminate despite its name, when {@code true} a single line
     *     feed byte (0x0a) - not a NUL byte - is appended after the encoded
     *     text; {@link #main(String[])} relies on this to separate lines
     * @return a right-sized array holding the UTF-8 bytes, or {@code null}
     *     when {@code unicode} is {@code null}
     * @exception CharConversionException if the input contains a high
     *     surrogate that is not followed by a low surrogate, or a low
     *     surrogate that is not preceded by a high surrogate
     */
    protected static byte[] UnicodeToUTF8(char[] unicode, boolean nullTerminate)
            throws CharConversionException {
        if (unicode == null) {
            return null;
        }

        // Worst case is three UTF-8 bytes per char: a BMP char needs at most
        // three bytes and a surrogate pair (two chars) needs four.
        int maxsize = unicode.length * 3;
        if (nullTerminate) {
            maxsize++;
        }
        byte[] scratch = new byte[maxsize];

        try {
            int utf = 0; // next free position in scratch
            for (int uni = 0; uni < unicode.length; uni++) {
                char ch = unicode[uni];
                int ucs; // the full code point for this character

                if (ch >= 0xd800 && ch <= 0xdbff) {
                    // High half of a surrogate pair: the low half must follow.
                    if (uni == unicode.length - 1) {
                        throw new CharConversionException(
                                "High surrogate 0x" + Integer.toHexString(ch)
                                        + " at index " + uni
                                        + " is the last character; its low surrogate is missing");
                    }
                    char low = unicode[++uni];
                    if (low < 0xdc00 || low > 0xdfff) {
                        throw new CharConversionException(
                                "High surrogate 0x" + Integer.toHexString(ch)
                                        + " at index " + (uni - 1)
                                        + " is followed by 0x" + Integer.toHexString(low)
                                        + ", which is not a low surrogate");
                    }
                    ucs = (((ch - 0xd800) << 10) | (low - 0xdc00)) + 0x00010000;
                } else if (ch >= 0xdc00 && ch <= 0xdfff) {
                    throw new CharConversionException(
                            "Orphaned low surrogate 0x" + Integer.toHexString(ch)
                                    + " at index " + uni);
                } else {
                    ucs = ch; // plain BMP character
                }

                utf = encodeUcs4(ucs, scratch, utf);
            }

            if (nullTerminate) {
                scratch[utf++] = (byte) 0x0a; // LF only, no CR
            }

            // Copy into a correctly sized array.
            byte[] utf8 = new byte[utf];
            System.arraycopy(scratch, 0, utf8, 0, utf);
            return utf8;
        } finally {
            // Wipe the work buffer on every exit path.
            for (int i = 0; i < scratch.length; i++) {
                scratch[i] = 0;
            }
        }
    }

    /**
     * Writes the UTF-8 form of one code point into {@code out} starting at
     * {@code pos} and returns the position just past the last byte written.
     *
     * <p>Besides the standard one- to four-byte forms, the legacy five- and
     * six-byte forms are kept for values above 0x1FFFFF and print a warning
     * when used. {@link #UnicodeToUTF8(char[], boolean)} never produces such
     * values, because a surrogate pair decodes to at most 0x10FFFF.
     */
    private static int encodeUcs4(int ucs, byte[] out, int pos) {
        if (ucs < 0x80) {
            // 0000 0000 - 0000 007f (ASCII)
            out[pos++] = (byte) ucs;
        } else if (ucs < 0x800) {
            // 0000 0080 - 0000 07ff
            out[pos++] = (byte) (0xc0 | (ucs >> 6));
            out[pos++] = (byte) (0x80 | (ucs & 0x3f));
        } else if (ucs < 0x00010000) {
            // 0000 0800 - 0000 ffff
            out[pos++] = (byte) (0xe0 | (ucs >> 12));
            out[pos++] = (byte) (0x80 | ((ucs >> 6) & 0x3f));
            out[pos++] = (byte) (0x80 | (ucs & 0x3f));
        } else if (ucs < 0x00200000) {
            // 0001 0000 - 001f ffff
            out[pos++] = (byte) (0xf0 | (ucs >> 18));
            out[pos++] = (byte) (0x80 | ((ucs >> 12) & 0x3f));
            out[pos++] = (byte) (0x80 | ((ucs >> 6) & 0x3f));
            out[pos++] = (byte) (0x80 | (ucs & 0x3f));
        } else if (ucs < 0x04000000) {
            // 0020 0000 - 03ff ffff (five-byte form, not legal UTF-8)
            out[pos++] = (byte) (0xf8 | (ucs >> 24));
            out[pos++] = (byte) (0x80 | ((ucs >> 18) & 0x3f));
            out[pos++] = (byte) (0x80 | ((ucs >> 12) & 0x3f));
            out[pos++] = (byte) (0x80 | ((ucs >> 6) & 0x3f));
            out[pos++] = (byte) (0x80 | (ucs & 0x3f));
            System.out.println(currentDate() + " :Warning: UTF-8 code for Unicode Character is Illegal");
        } else {
            // 0400 0000 - 7fff ffff (six-byte form, not legal UTF-8)
            out[pos++] = (byte) (0xfc | (ucs >> 30));
            out[pos++] = (byte) (0x80 | ((ucs >> 24) & 0x3f));
            out[pos++] = (byte) (0x80 | ((ucs >> 18) & 0x3f));
            out[pos++] = (byte) (0x80 | ((ucs >> 12) & 0x3f));
            out[pos++] = (byte) (0x80 | ((ucs >> 6) & 0x3f));
            out[pos++] = (byte) (0x80 | (ucs & 0x3f));
            System.out.println(currentDate() + " :Warning: UTF-8 code for Unicode Character is Illegal");
        }
        return pos;
    }

    /**
     * Converts the file named by {@code args[0]} to UTF-8 and writes the
     * result to the file named by {@code args[1]}.
     *
     * <p>Each input line is encoded separately and written followed by a
     * line feed, so the original line terminators are replaced by LF and the
     * output always ends with LF. Prints a usage message and exits with
     * status 1 unless exactly two arguments are given. Conversion and I/O
     * errors are reported on the console; both files are closed in every
     * case.
     *
     * @param args input file name followed by output file name
     */
    public static void main(String[] args) {
        if (args.length != 2) {
            System.out.println("Usage: java Unicode2UTFConverter <input unicode filename> <output utf8 filename>");
            System.exit(1);
        }
        String inFilePath = args[0]; // Input filename is the first argument.
        String outFilePath = args[1]; // Output filename is the second argument.
        System.out.println(currentDate() + " : Starting Unicode to UTF8 Conversion");

        BufferedReader lin = null;
        OutputStream fos = null;
        try {
            // NOTE(review): no charset is given, so the input is decoded with
            // the platform default charset - confirm that matches the actual
            // encoding of the input files (e.g. UTF-16).
            lin = new BufferedReader(new InputStreamReader(new FileInputStream(inFilePath)));
            fos = new BufferedOutputStream(new FileOutputStream(outFilePath));

            String ls; // holds each line in turn
            while ((ls = lin.readLine()) != null) {
                fos.write(UnicodeToUTF8(ls.toCharArray(), true));
            }

            // Close inside the try so a failed flush/close is reported and
            // the success message is only printed once the data is on disk.
            lin.close();
            lin = null;
            fos.close();
            fos = null;
            System.out.println(currentDate() + " : Unicode to UTF8 Conversion Successful");
        } catch (CharConversionException e) {
            System.out.println("Error converting Unicode " + e);
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            closeQuietly(lin);
            closeQuietly(fos);
        }
    }

    /** Closes a stream left open by a failed conversion; does nothing for {@code null}. */
    private static void closeQuietly(Closeable stream) {
        if (stream == null) {
            return;
        }
        try {
            stream.close();
        } catch (IOException ignored) {
            // Only reached after an earlier failure that has already been
            // reported; a secondary close error adds nothing useful.
        }
    }

    /** Returns the current date and time formatted with the default locale's SHORT date and MEDIUM time styles. */
    private static String currentDate() {
        DateFormat shortFormatter =
                DateFormat.getDateTimeInstance(DateFormat.SHORT, DateFormat.MEDIUM);
        return shortFormatter.format(new Date());
    }
}


Leave a Reply

Newer Post Older Post
Subscribe to: Post Comments ( Atom )
  • Popular
  • Recent
  • Archives
Powered by Blogger.
 
 
 
© 2011 Java Programs and Examples with Output | Designs by Web2feel & Fab Themes

Bloggerized by DheTemplate.com - Main Blogger