StarDict 词典idx文件格式:
单词列表中的每一个条目依次包含以下三个域:
word_str; // a utf-8 string terminated by '\0'.
// 一个 utf-8 编码的字符串,以 '\0' 终止符结束。word_str 的长度应小于 256。
word_data_offset; // word data's offset in .dict file
// 单词数据在 .dict 文件中的偏移。
//If the version is "3.0.0" and "idxoffsetbits=64",
//word_data_offset will be 64-bits unsigned number in network byte order.
word_data_size; // word data's total size in .dict file
// 单词数据在 .dict 文件中的总大小,word_data_size should be 32-bits unsigned number
// in network byte order.
// Command main converts a StarDict .idx index file into a tab-separated text
// listing.
//
// Every .idx entry is read as a NUL-terminated headword followed by two
// 32-bit big-endian unsigned integers: the offset of the entry's data in the
// companion .dict file and the size of that data. For each entry one line of
// the form
//
//	headword<TAB>offset<TAB>size<LF>
//
// is written to the output file. Only 32-bit offsets are decoded; index files
// that use 64-bit offsets are not supported.
package main

import (
	"bufio"
	"encoding/binary"
	"fmt"
	"io"
	"os"
	"strconv"
)

const (
	// idxPath is the index file that main reads.
	idxPath = "gaojihanyudacidian_fix.idx"
	// outPath is the text file that main creates (truncating any existing one).
	outPath = "output.txt"
)

// main converts idxPath into outPath. On failure it prints the error to
// standard error and exits with status 1; on success it prints "finish read".
func main() {
	if err := run(idxPath, outPath); err != nil {
		fmt.Fprintln(os.Stderr, err)
		os.Exit(1)
	}
	fmt.Print("\nfinish read\n")
}

// run opens inPath for reading, creates outPath, and converts one into the
// other. It lives outside main so the deferred Close calls run before main
// decides whether to exit, and so a failed Close of the output is reported.
func run(inPath, outPath string) (err error) {
	in, err := os.Open(inPath)
	if err != nil {
		return err
	}
	defer in.Close()

	out, err := os.Create(outPath)
	if err != nil {
		return err
	}
	defer func() {
		// A failed Close on a written file can mean lost data, so surface it
		// unless an earlier error is already being returned.
		if cerr := out.Close(); err == nil {
			err = cerr
		}
	}()

	return convertIdx(in, out)
}

// convertIdx reads StarDict index entries from r until end of input and writes
// one "headword\toffset\tsize\n" line per entry to w.
//
// Headword bytes are copied verbatim up to the terminating NUL, so headwords
// of any length and any UTF-8 sequence width (including 4-byte sequences) are
// accepted. Input that ends in the middle of an entry yields an error wrapping
// io.ErrUnexpectedEOF; lines for the complete entries before it are still
// flushed to w.
func convertIdx(r io.Reader, w io.Writer) (err error) {
	br := bufio.NewReader(r)
	bw := bufio.NewWriter(w)
	defer func() {
		// Always flush so complete entries reach w even when a later entry
		// turns out to be malformed.
		if ferr := bw.Flush(); err == nil {
			err = ferr
		}
	}()

	var (
		fields [8]byte // offset (4 bytes) followed by size (4 bytes)
		line   []byte  // reused output buffer
	)
	for {
		word, rerr := br.ReadBytes(0)
		if rerr == io.EOF && len(word) == 0 {
			// Clean end of input: no partial entry is pending.
			return nil
		}
		if rerr != nil {
			return fmt.Errorf("reading headword: %w", unexpectedEOF(rerr))
		}
		word = word[:len(word)-1] // drop the NUL terminator

		if _, ferr := io.ReadFull(br, fields[:]); ferr != nil {
			return fmt.Errorf("reading offset and size of %q: %w", word, unexpectedEOF(ferr))
		}
		offset := binary.BigEndian.Uint32(fields[:4])
		size := binary.BigEndian.Uint32(fields[4:])

		line = append(line[:0], word...)
		line = append(line, '\t')
		line = strconv.AppendUint(line, uint64(offset), 10)
		line = append(line, '\t')
		line = strconv.AppendUint(line, uint64(size), 10)
		line = append(line, '\n')
		if _, werr := bw.Write(line); werr != nil {
			return werr
		}
	}
}

// unexpectedEOF maps io.EOF to io.ErrUnexpectedEOF and returns every other
// error unchanged. It is used where running out of input means the current
// entry is incomplete.
func unexpectedEOF(err error) error {
	if err == io.EOF {
		return io.ErrUnexpectedEOF
	}
	return err
}
有疑问加站长微信联系(非本文作者)