// A Java program that converts the Unicode characters of an input file to UTF-8 encoding.
import java.io.*;
import java.util.Date;
import java.text.DateFormat;
import java.text.SimpleDateFormat;
/**
 * Command-line utility that reads a text file line by line and writes each
 * line to an output file as UTF-8 bytes followed by a line feed.
 */
public class Unicode2UTFConverter {

    /**
     * Creates a new UTF-8 encoded byte array representing the char[] passed in.
     *
     * <p>Surrogate pairs in the input are combined into a single code point
     * and emitted as a four-byte sequence. The temporary working buffer is
     * zero-filled before this method returns or throws.
     *
     * @param unicode An array of UTF-16 code units, which may contain
     *        surrogate pairs. If null, null is returned.
     * @param nullTerminate When true, a single line-feed byte (0x0A) is
     *        appended to the output. Despite the parameter name, no NUL byte
     *        is written; {@link #main} relies on this to end each line.
     * @return A newly allocated array sized exactly to the encoded output,
     *         or null when {@code unicode} is null.
     * @exception CharConversionException If the input contains a high
     *            surrogate without a following low surrogate, or a low
     *            surrogate that is not preceded by a high surrogate.
     */
    protected static byte[] UnicodeToUTF8(char[] unicode, boolean nullTerminate)
            throws CharConversionException {
        if (unicode == null) {
            return null;
        }

        // Worst case is 3 bytes per char: a BMP char needs at most 3 bytes and
        // a surrogate pair (2 chars) needs 4. One extra byte for the terminator.
        int maxSize = unicode.length * 3;
        if (nullTerminate) {
            maxSize++;
        }
        byte[] scratch = new byte[maxSize];

        try {
            int utf = 0; // next free index in scratch
            for (int uni = 0; uni < unicode.length; uni++) {
                char ch = unicode[uni];
                int ucs; // decoded code point

                if (ch >= 0xd800 && ch <= 0xdbff) {
                    // High half of a surrogate pair: the low half must follow.
                    if (uni == unicode.length - 1) {
                        throw new CharConversionException(
                                "High surrogate at index " + uni + " has no low surrogate");
                    }
                    char low = unicode[++uni];
                    if (low < 0xdc00 || low > 0xdfff) {
                        throw new CharConversionException(
                                "High surrogate at index " + (uni - 1)
                                + " is not followed by a low surrogate");
                    }
                    ucs = (((ch - 0xd800) << 10) | (low - 0xdc00)) + 0x00010000;
                } else if (ch >= 0xdc00 && ch <= 0xdfff) {
                    throw new CharConversionException(
                            "Orphaned low surrogate at index " + uni);
                } else {
                    ucs = ch;
                }

                utf = encodeCodePoint(ucs, scratch, utf);
            }

            if (nullTerminate) {
                scratch[utf++] = (byte) 0x0a; // line feed
            }

            // Copy into a correctly sized array.
            byte[] result = new byte[utf];
            System.arraycopy(scratch, 0, result, 0, utf);
            return result;
        } finally {
            // The input may be sensitive, so wipe the working buffer on every
            // exit path, including when a CharConversionException is thrown.
            java.util.Arrays.fill(scratch, (byte) 0);
        }
    }

    /**
     * Writes the UTF-8 encoding of one code point into {@code out} starting at
     * {@code pos} and returns the index just past the last byte written.
     *
     * <p>Code points decoded from char input never exceed 0x10FFFF, so the
     * longest sequence produced is four bytes.
     */
    private static int encodeCodePoint(int ucs, byte[] out, int pos) {
        if (ucs < 0x80) {
            // 0000 0000 - 0000 007f (ASCII)
            out[pos++] = (byte) ucs;
        } else if (ucs < 0x800) {
            // 0000 0080 - 0000 07ff
            out[pos++] = (byte) (0xc0 | (ucs >> 6));
            out[pos++] = (byte) (0x80 | (ucs & 0x3f));
        } else if (ucs < 0x00010000) {
            // 0000 0800 - 0000 ffff
            out[pos++] = (byte) (0xe0 | (ucs >> 12));
            out[pos++] = (byte) (0x80 | ((ucs >> 6) & 0x3f));
            out[pos++] = (byte) (0x80 | (ucs & 0x3f));
        } else {
            // 0001 0000 - 0010 ffff
            out[pos++] = (byte) (0xf0 | (ucs >> 18));
            out[pos++] = (byte) (0x80 | ((ucs >> 12) & 0x3f));
            out[pos++] = (byte) (0x80 | ((ucs >> 6) & 0x3f));
            out[pos++] = (byte) (0x80 | (ucs & 0x3f));
        }
        return pos;
    }

    /**
     * Main method. Expects exactly two arguments: the input file name and the
     * output file name. Each input line is converted with
     * {@link #UnicodeToUTF8(char[], boolean)} and written to the output file
     * followed by a line feed.
     */
    public static void main(String[] args) {
        if (args.length != 2) {
            System.out.println("Usage: java Unicode2UTFConverter <input unicode filename> <output utf8 filename>");
            // Exit status is left at 0 so existing callers see no change.
            System.exit(0);
        }
        String inFilePath = args[0];  // Input filename is the first argument.
        String outFilePath = args[1]; // Output filename is the second argument.
        System.out.println(currentDate() + " : Starting Unicode to UTF8 Conversion");
        try {
            // NOTE(review): the reader decodes with the platform default
            // charset; confirm that matches the encoding of the input files.
            BufferedReader lin = new BufferedReader(
                    new InputStreamReader(new FileInputStream(inFilePath)));
            try {
                FileOutputStream fos = new FileOutputStream(outFilePath);
                try {
                    String ls; // holds each line in turn
                    while ((ls = lin.readLine()) != null) {
                        fos.write(UnicodeToUTF8(ls.toCharArray(), true));
                    }
                } finally {
                    // Close the output even when reading or conversion fails.
                    fos.close();
                }
            } finally {
                lin.close();
            }
            System.out.println(currentDate() + " : Unicode to UTF8 Conversion Successful");
        } catch (CharConversionException e) {
            System.out.println("Error converting Unicode " + e);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Returns the current date and time formatted with the default locale's
     * SHORT date style and MEDIUM time style.
     */
    private static String currentDate() {
        DateFormat shortFormatter =
                DateFormat.getDateTimeInstance(DateFormat.SHORT, DateFormat.MEDIUM);
        return shortFormatter.format(new Date());
    }
}