Java-解析搜狗输入法分类词库scel文件

scel 文件是搜狗输入法自定义的一种分类词库格式，可以下载后导入到搜狗输入法中使用。

下面对scel文件进行解析:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
import java.io.File;
import java.io.RandomAccessFile;
import java.util.Arrays;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;

/**
 * Parser for Sogou input-method category dictionary ({@code .scel}) files.
 *
 * <p>After {@link #parse()} has run, {@code wordCount} holds the 32-bit value
 * stored at offset 0x124, {@code dict} maps pinyin-table indices to pinyin
 * syllables, and {@code wordList} maps every word found in the file to the
 * list of pinyin syllables it was stored with.
 *
 * <p>All multi-byte integers in the file are read as little-endian and all
 * text is decoded as UTF-16LE.
 */
public class SCEL {
    /** Number of header bytes read to determine the file variant. */
    private static final int HEADER_LENGTH = 128;
    /** Offset of the 32-bit word count. */
    private static final int WORD_COUNT_OFFSET = 0x124;
    /** Offset of the first pinyin-table entry. */
    private static final int PINYIN_TABLE_OFFSET = 0x1544;
    /** Start of the word table when header byte 4 is 0x44. */
    private static final int WORD_TABLE_OFFSET_44 = 0x2628;
    /** Start of the word table when header byte 4 is 0x45. */
    private static final int WORD_TABLE_OFFSET_45 = 0x26C4;
    /** Pinyin syllable that marks the end of the pinyin table. */
    private static final String LAST_PINYIN = "zuo";
    /** Number of bytes skipped after each word. */
    private static final int WORD_TRAILER_LENGTH = 12;

    File p;
    int wordCount = 0;
    Map<Integer,String> dict = new HashMap<Integer,String>();
    Map<String,LinkedList<String>> wordList = new HashMap<String,LinkedList<String>>();

    /**
     * Creates a parser for the given file. The file is not opened until
     * {@link #parse()} is called.
     *
     * @param path path of the {@code .scel} file
     */
    SCEL(String path) {
        p = new File(path);
    }

    /** Prints every parsed word immediately followed by its pinyin list, one per line. */
    void printWordListwithPinyin() {
        for (Map.Entry<String, LinkedList<String>> entry : wordList.entrySet()) {
            System.out.println(entry.getKey() + entry.getValue());
        }
    }

    /** Prints every parsed word, one per line. */
    void printWordList() {
        for (String word : wordList.keySet()) {
            System.out.println(word);
        }
    }

    /**
     * Reads the file and fills {@code wordCount}, {@code dict} and {@code wordList}.
     *
     * @throws java.io.IOException  if header byte 4 is neither 0x44 nor 0x45
     * @throws java.io.EOFException if the file ends in the middle of a record
     * @throws Exception            on any other I/O failure
     */
    public void parse() throws Exception {
        RandomAccessFile raf = new RandomAccessFile(p, "r");
        try {
            // Sample header start: \x40\x15\x00\x00\x44\x43\x53\x01
            byte[] header = new byte[HEADER_LENGTH];
            raf.readFully(header);
            long hzPosition = wordTableOffset(header[4]);

            raf.seek(WORD_COUNT_OFFSET);
            wordCount = readInt(raf);

            raf.seek(PINYIN_TABLE_OFFSET);
            readPinyinTable(raf, hzPosition);

            raf.seek(hzPosition);
            readWordTable(raf);
        } finally {
            // Close the file even when parsing fails part-way through.
            raf.close();
        }
    }

    /** Maps header byte 4 to the offset at which the word table starts. */
    private long wordTableOffset(byte variant) throws Exception {
        if (variant == 0x44) {
            return WORD_TABLE_OFFSET_44;
        }
        if (variant == 0x45) {
            return WORD_TABLE_OFFSET_45;
        }
        // Without a known offset the word table cannot be located.
        throw new java.io.IOException(
                "Unsupported scel header byte 0x" + Integer.toHexString(variant & 0xFF) + " in " + p);
    }

    /**
     * Reads (index, byte length, UTF-16LE text) entries into {@code dict},
     * stopping after the entry "zuo" or when {@code limit} is reached.
     */
    private void readPinyinTable(RandomAccessFile raf, long limit) throws Exception {
        // The limit guards against looping forever on a file that has no "zuo" entry.
        while (raf.getFilePointer() < limit) {
            int index = readUnsignedShort(raf);
            int byteLength = readUnsignedShort(raf);
            byte[] raw = new byte[byteLength];
            raf.readFully(raw);
            String pinyin = getString(raw, byteLength);
            dict.put(index, pinyin);
            if (LAST_PINYIN.equals(pinyin)) {
                break;
            }
        }
    }

    /**
     * Reads word groups until the end of the file. Each group starts with the
     * number of words sharing one pinyin sequence and the byte length of that
     * sequence, followed by the pinyin indices and then the words themselves.
     */
    private void readWordTable(RandomAccessFile raf) throws Exception {
        byte[] trailer = new byte[WORD_TRAILER_LENGTH];
        while (raf.getFilePointer() < raf.length()) {
            int samePinyinCount = readUnsignedShort(raf);
            int pinyinByteCount = readUnsignedShort(raf);

            LinkedList<String> wordPinyin = new LinkedList<String>();
            for (int i = 0; i < pinyinByteCount / 2; i++) {
                wordPinyin.add(dict.get(readUnsignedShort(raf)));
            }
            if ((pinyinByteCount & 1) != 0) {
                // An odd byte count leaves one byte that belongs to no index; consume it.
                raf.readUnsignedByte();
            }

            // Homophones: every word in this group uses the pinyin sequence read above.
            for (int s = 0; s < samePinyinCount; s++) {
                int wordByteCount = readUnsignedShort(raf);
                byte[] raw = new byte[wordByteCount];
                raf.readFully(raw);
                wordList.put(getString(raw, wordByteCount), wordPinyin);
                // NOTE(review): presumably a 2-byte extension length followed by
                // 10 bytes of extension data; a fixed 12 bytes are skipped here.
                raf.readFully(trailer);
            }
        }
    }

    /** Reads an unsigned little-endian 16-bit value. */
    private int readUnsignedShort(RandomAccessFile raf) throws Exception {
        int low = raf.readUnsignedByte();
        int high = raf.readUnsignedByte();
        return low | (high << 8);
    }

    /** Reads a little-endian 32-bit value. */
    private int readInt(RandomAccessFile raf) throws Exception {
        byte[] buff = new byte[4];
        raf.readFully(buff);
        // Mask each byte before shifting so that sign extension cannot leak into the result.
        return (buff[0] & 0xFF)
                | ((buff[1] & 0xFF) << 8)
                | ((buff[2] & 0xFF) << 16)
                | ((buff[3] & 0xFF) << 24);
    }

    /** Decodes the first {@code num} bytes of {@code buff} as UTF-16LE text. */
    private String getString(byte[] buff, int num) throws Exception {
        return new String(buff, 0, num, "UTF-16LE");
    }

}

调用Demo:

1
2
3
4
5
6
7
try {
SCEL scel = new SCEL("/Users/YI/Downloads/深圳地名.scel");
scel.parse();
scel.printWordListwithPinyin();
} catch (Exception e) {
e.printStackTrace();
}

可以看到解析后的词组:

Sougou-Scel