Java-解析搜狗输入法核心词库sgim_core.bin文件

sgim_core.bin文件是搜狗输入法的核心词库。

下面对sgim_core.bin文件进行解析:

完整的Java解析代码如下:
import java.io.IOException;
import java.io.RandomAccessFile;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.nio.channels.FileChannel;


/**
 * Dumps the word list stored in a Sogou Pinyin core dictionary file ({@code sgim_core.bin}).
 *
 * <p>As read by this program, the file holds a little-endian 32-bit word count at offset
 * {@code 0xC}, and a word table that begins at the first occurrence of the byte sequence
 * {@code 02 00 4A 55}. Every table entry is a little-endian 16-bit byte length followed by that
 * many bytes of UTF-16LE text.
 */
public class Sougou {
    /** Dictionary path used when no command-line argument is supplied. */
    private static final String DEFAULT_BIN_FILE =
            "/Users/YI/Documents/workspace/TestWordLib/src/db/sgim_core.bin";

    /** Offset of the 32-bit word count inside the file. */
    private static final int WORD_COUNT_OFFSET = 0xC;

    /** Size in bytes of the word-count field. */
    private static final int WORD_COUNT_SIZE = 4;

    /** Size in bytes of the length prefix in front of every word. */
    private static final int LENGTH_PREFIX_SIZE = 2;

    /**
     * Byte sequence marking the start of the word table. It is itself the first entry:
     * length 2 followed by the single UTF-16LE code unit U+554A.
     */
    private static final byte[] SEARCH_KEY = { 0x02, 0x00, 0x4A, 0x55 };

    /**
     * Prints every word longer than one character, followed by a short summary.
     *
     * @param args optional; {@code args[0]} is the path of the dictionary file. When absent the
     *     built-in default path is used.
     * @throws IOException if the file cannot be read, is larger than 2 GiB, or is too short to
     *     contain the word count
     */
    public static void main(final String[] args) throws IOException {
        final String binFile = (args != null && args.length > 0) ? args[0] : DEFAULT_BIN_FILE;

        final ByteBuffer bb = readFully(binFile);
        bb.order(ByteOrder.LITTLE_ENDIAN);

        if (bb.limit() < WORD_COUNT_OFFSET + WORD_COUNT_SIZE) {
            throw new IOException("文件过短,无法读取单词数量: " + binFile);
        }
        int words = bb.getInt(WORD_COUNT_OFFSET);
        System.out.println("读入文件: " + binFile + ",单词:" + words);

        final int startPos = indexOf(bb.array(), bb.limit(), SEARCH_KEY);
        if (startPos == -1) {
            System.err.println("文件版本已更新!");
            return;
        }

        System.out.println("单词起始位置:0x" + Integer.toHexString(startPos));
        bb.position(startPos);

        int counter = 0;
        while (words-- > 0 && bb.remaining() >= LENGTH_PREFIX_SIZE) {
            final int length = bb.getShort();
            if (length < 0 || length > bb.remaining()) {
                // Corrupt or truncated entry: step back over the length prefix so the reported
                // end position points at the bad entry, then stop instead of crashing.
                bb.position(bb.position() - LENGTH_PREFIX_SIZE);
                System.err.println("单词数据不完整,停止解析:0x" + Integer.toHexString(bb.position()));
                break;
            }
            // Decode straight from the backing array; no intermediate copy is needed.
            final String word = new String(bb.array(), bb.position(), length, "UTF-16LE");
            bb.position(bb.position() + length);
            counter++;
            if (word.length() > 1) {
                System.out.println(word);
            }
        }

        final int endPos = bb.position();
        final int diff = endPos - startPos;
        System.out.println("读出单词'" + binFile + "':" + counter);
        System.out.println("单词结尾位置:0x" + Integer.toHexString(endPos));
        System.out.println("单词词典长度:0x" + Integer.toHexString(diff));
    }

    /**
     * Reads the whole file into a heap buffer positioned at 0 with its limit at the byte count.
     *
     * @param path file to read
     * @return array-backed buffer holding the file content
     * @throws IOException if the file cannot be read or does not fit into a single buffer
     */
    private static ByteBuffer readFully(final String path) throws IOException {
        try (RandomAccessFile file = new RandomAccessFile(path, "r");
                FileChannel channel = file.getChannel()) {
            final long size = channel.size();
            if (size > Integer.MAX_VALUE) {
                throw new IOException("文件过大: " + path);
            }
            final ByteBuffer buffer = ByteBuffer.allocate((int) size);
            // A single read() may return fewer bytes than requested, so loop until full or EOF.
            while (buffer.hasRemaining()) {
                if (channel.read(buffer) == -1) {
                    break;
                }
            }
            buffer.flip();
            return buffer;
        }
    }

    /**
     * Finds the first occurrence of {@code pattern} within the first {@code limit} bytes of
     * {@code data}. Every start offset is tried, so a partial match directly in front of the
     * real occurrence cannot hide it.
     *
     * @param data bytes to search
     * @param limit number of valid bytes in {@code data}
     * @param pattern byte sequence to look for
     * @return start offset of the first occurrence, or {@code -1} if there is none
     */
    private static int indexOf(final byte[] data, final int limit, final byte[] pattern) {
        final int lastStart = limit - pattern.length;
        for (int start = 0; start <= lastStart; start++) {
            int matched = 0;
            while (matched < pattern.length && data[start + matched] == pattern[matched]) {
                matched++;
            }
            if (matched == pattern.length) {
                return start;
            }
        }
        return -1;
    }
}

运行后可以看到解析后的词组:

Sougou-Corebin