Java-获取文件的编码

获取任意文件的编码格式。

首先,需要获取 Java 运行环境支持的所有字符集:

// Collect the names of all charsets available in this JVM (the keys of the availableCharsets() map).
Set<String> charsetNames = Charset.availableCharsets().keySet();

然后读取文件的一小部分字节,用每一种字符集依次尝试解码,第一个能够无错误解码的字符集即被视为文件的编码。注意:这只是一种试探性的推断,能够成功解码并不保证它就是文件的真实编码。代码如下:

import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.charset.CharacterCodingException;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.util.Set;

/**
 * Guesses the character encoding of a file by trial decoding.
 *
 * <p>Each candidate charset is tried in the order given; the first one whose decoder
 * accepts a chunk of the file without reporting malformed or unmappable input wins.
 * This is a heuristic: many byte sequences are valid in several charsets, so a
 * successful decode does not prove the file was written in that charset.
 */
public class CharsetDetector {

    /** Number of bytes read from the file for each decoding attempt. */
    private static final int CHUNK_SIZE = 512;

    /** File inspected by {@link #main(String[])} when no path is given on the command line. */
    private static final String DEFAULT_PATH = "/Users/YI/Desktop/audio.pcm";

    /**
     * Returns the first charset in {@code charsets} that can decode the file.
     *
     * @param f        the file to inspect
     * @param charsets candidate charset names, tried in array order
     * @return the first matching charset, or {@code null} if none matches, the file is
     *         empty, or the file cannot be read
     * @throws java.nio.charset.IllegalCharsetNameException if a candidate name is illegal
     * @throws java.nio.charset.UnsupportedCharsetException if a candidate is not supported
     */
    public Charset detectCharset(File f, String[] charsets) {
        for (String charsetName : charsets) {
            Charset charset = detectCharset(f, Charset.forName(charsetName));
            if (charset != null) {
                return charset;
            }
        }
        return null;
    }

    /**
     * Tests a single charset against the file.
     *
     * <p>The file is read in chunks of up to {@link #CHUNK_SIZE} bytes and the charset is
     * accepted as soon as one chunk decodes cleanly, so a later chunk can still match
     * after an earlier one failed.
     *
     * @return {@code charset} if a chunk decoded without error, otherwise {@code null}
     */
    private Charset detectCharset(File f, Charset charset) {
        // try-with-resources closes the stream even when reading or decoding throws.
        try (BufferedInputStream input = new BufferedInputStream(new FileInputStream(f))) {
            CharsetDecoder decoder = charset.newDecoder();
            byte[] buffer = new byte[CHUNK_SIZE];
            int read;
            while ((read = input.read(buffer)) != -1) {
                if (identify(buffer, read, decoder)) {
                    return charset;
                }
            }
            return null;
        } catch (IOException | RuntimeException e) {
            // Best effort: an unreadable file or a misbehaving decoder means "no match".
            return null;
        }
    }

    /**
     * Reports whether the first {@code length} bytes of {@code bytes} decode cleanly.
     *
     * <p>Only the bytes actually read are decoded; the rest of the buffer may hold zero
     * padding or stale data from a previous chunk and must not influence the result.
     * {@link CharsetDecoder#decode(ByteBuffer)} resets the decoder before decoding, so
     * the same decoder can be reused for every chunk.
     */
    private boolean identify(byte[] bytes, int length, CharsetDecoder decoder) {
        try {
            decoder.decode(ByteBuffer.wrap(bytes, 0, length));
            return true;
        } catch (CharacterCodingException e) {
            return false;
        }
    }

    /**
     * Prints the detected charset of a file, trying every charset the JVM provides.
     *
     * @param args optional; {@code args[0]} is the path of the file to inspect,
     *             defaulting to {@link #DEFAULT_PATH}
     */
    public static void main(String[] args) {
        File f = new File(args.length > 0 ? args[0] : DEFAULT_PATH);

        Set<String> charsetNames = Charset.availableCharsets().keySet();
        String[] charsetsToBeTested = charsetNames.toArray(new String[0]);

        try {
            Charset charset = new CharsetDetector().detectCharset(f, charsetsToBeTested);
            if (charset != null) {
                System.out.println(charset.toString());
            } else {
                System.out.println("Unrecognized charset.");
            }
        } catch (RuntimeException e) {
            // Report the failure instead of silently discarding it.
            System.err.println("Charset detection failed: " + e);
        }
    }
}