最近在做一个全文检索功能,要求在用户上传附件时提取其内容并存入数据库。CSV、Excel、Word、PPT 都已搞定,只剩下最简单的 TXT 文档。一开始忽略了 TXT 还有不同的编码,导致部分编码格式的文本出现乱码。兼容写法如下:用一个判断文件头(BOM)的方法来确定编码,没有 BOM 的文件一律按 GBK 处理。
/**
 * Guesses the character encoding of a text file from its leading byte-order mark (BOM).
 *
 * <p>Recognised signatures:
 * <ul>
 *   <li>{@code EF BB BF} gives {@code "UTF-8"}</li>
 *   <li>{@code FF FE} gives {@code "Unicode"}</li>
 *   <li>{@code FE FF} gives {@code "UTF-16BE"}</li>
 * </ul>
 * Anything else, including empty files, files shorter than the signature and text
 * without a BOM, falls back to {@code "GBK"}.
 *
 * @param fileName path of the file to inspect
 * @return the charset name to use when decoding the file
 * @throws IOException if the file cannot be opened or read
 */
private String getCharset(String fileName) throws IOException {
    BufferedInputStream bin = new BufferedInputStream(new FileInputStream(fileName));
    try {
        // Compare the bytes individually. read() returns -1 at end of file, and folding
        // that into (b0 << 8) + b1 makes a lone 0xFF byte look like the FE FF signature.
        int b0 = bin.read();
        int b1 = bin.read();
        if (b0 == 0xEF && b1 == 0xBB) {
            // The UTF-8 BOM is three bytes long; require the last one as well.
            return bin.read() == 0xBF ? "UTF-8" : "GBK";
        }
        if (b0 == 0xFF && b1 == 0xFE) {
            // Name kept from the original code: "Unicode" is a Java alias for UTF-16,
            // whose decoder picks the byte order from the BOM itself.
            return "Unicode";
        }
        if (b0 == 0xFE && b1 == 0xFF) {
            return "UTF-16BE";
        }
        return "GBK";
    } finally {
        // Always release the file handle, even when a read fails.
        bin.close();
    }
}
/**
 * Reads a plain-text file into a single string, decoding it with the charset that
 * {@code getCharset(filePath)} reports for the file.
 *
 * <p>Lines are concatenated exactly as {@code BufferedReader.readLine()} returns them,
 * so line terminators are dropped and no separator is inserted between lines.
 * A leading byte-order mark character (U+FEFF) left in place by the decoder is removed.
 *
 * @param filePath path of the text file to read
 * @return the decoded text, or {@code null} if the file could not be read
 */
public String getTextFromText(String filePath) {
    try {
        // Detect the charset before opening the content stream, so a detection failure
        // cannot leave a second stream open.
        String charset = getCharset(filePath);
        FileInputStream in = new FileInputStream(filePath);
        try {
            BufferedReader br = new BufferedReader(new InputStreamReader(in, charset));
            StringBuilder sb = new StringBuilder();
            String line;
            while ((line = br.readLine()) != null) {
                sb.append(line);
            }
            // The UTF-8 and UTF-16BE decoders pass the BOM through as a character;
            // drop it so it does not end up in the stored text.
            if (sb.length() > 0 && sb.charAt(0) == '\uFEFF') {
                sb.deleteCharAt(0);
            }
            return sb.toString();
        } finally {
            // Closing the underlying stream releases the file on both the success and
            // failure paths.
            in.close();
        }
    } catch (IOException e) {
        // Covers FileNotFoundException and UnsupportedEncodingException too. The
        // original contract is kept: report the failure and return null.
        e.printStackTrace();
        return null;
    }
}