.bdict 是百度输入法的词库文件格式。想查看词库里的内容,就需要先把 .bdict 文件转换为 txt 文件。
下载:百度输入法-词库列表-餐饮
比如下载“菜名大全” 文件名:dict_file_734_20111227170031_1.0.0.bdict
文件大小134924字节
转换代码参考这篇文档:【搜狗&百度词库】.bdict文件与.scel转txt_scel文件在线-CSDN博客
import struct
import binascii


class Baidu(object):
    """Convert a Baidu IME ``.bdict`` dictionary file into a text word list.

    The conversion runs in two steps:

    1. :meth:`be2le` writes a copy of the input in which every adjacent
       pair of bytes is swapped (``<originfile>.le``).
    2. :meth:`le2txt` reads that copy as big-endian 16-bit units, collects
       every run of consecutive characters in U+4E00..U+9FFF and writes
       each run as one line of the text file.

    Attributes:
        originfile: Path of the input ``.bdict`` file.
        lefile: Path of the intermediate byte-swapped file.
        txtfile: Path of the output text file. It is derived by dropping
            the last five characters of ``originfile`` (the ``bdict``
            suffix) and appending ``txt``.
        buf: Two-slot scratch list kept for backward compatibility; the
            conversion no longer uses it.
        listwords: Characters of the word currently being collected.
    """

    # Byte offset in the swapped file at which scanning starts. The original
    # code sliced the *hex text* of the file at 0x350 characters, which is
    # 0x350 // 2 == 0x1A8 bytes; that behaviour is preserved here.
    WORD_DATA_OFFSET = 0x350 // 2

    # Inclusive code-point range treated as "Chinese character".
    FIRST_HAN = 0x4E00
    LAST_HAN = 0x9FFF

    def __init__(self, originfile):
        self.originfile = originfile
        self.lefile = originfile + '.le'
        self.txtfile = originfile[0:(len(originfile) - 5)] + 'txt'
        self.buf = [b'0', b'0']
        self.listwords = []

    def be2le(self):
        """Write ``self.lefile`` as ``self.originfile`` with byte pairs swapped.

        An input with an odd number of bytes is padded with a single zero
        byte first so that every byte has a partner.
        """
        with open(self.originfile, 'rb') as source:
            contents = source.read()

        # Pad to an even length with a real zero byte (the original code
        # accidentally reused the first byte of the file as padding).
        if len(contents) % 2:
            contents += b'\x00'

        # Swap every pair in two slice assignments instead of a Python loop.
        swapped = bytearray(len(contents))
        swapped[0::2] = contents[1::2]
        swapped[1::2] = contents[0::2]

        with open(self.lefile, 'wb') as target:
            target.write(swapped)
        print('写入成功转为小端的字节流')

    def le2txt(self):
        """Extract the words from ``self.lefile`` into ``self.txtfile``.

        Each 16-bit big-endian unit is inspected on its own. Units inside
        U+4E00..U+9FFF extend the current word; any other unit ends it.
        The text file is written as UTF-8, one word per line.
        """
        with open(self.lefile, 'rb') as swapped_file:
            data = swapped_file.read()[self.WORD_DATA_OFFSET:]

        self.listwords = []
        # UTF-8 is requested explicitly so that writing Chinese text does not
        # depend on (or fail under) the platform's locale encoding.
        with open(self.txtfile, 'w', encoding='utf-8') as txt_file:
            # Working on the integer value avoids decoding single units,
            # which raises for surrogate values found in binary data.
            for pos in range(0, len(data) - 1, 2):
                code_point = (data[pos] << 8) | data[pos + 1]
                if self.FIRST_HAN <= code_point <= self.LAST_HAN:
                    self.listwords.append(chr(code_point))
                elif self.listwords:
                    txt_file.write(''.join(self.listwords) + '\n')
                    self.listwords = []
            # Flush a word that runs up to the very end of the data.
            if self.listwords:
                txt_file.write(''.join(self.listwords) + '\n')
                self.listwords = []
        print('写入txt成功')
if __name__ == '__main__':
    # Replace the placeholder below with the path of your own .bdict file.
    path = '你的.bdict文件'
    bd = Baidu(path)
    # Step 1: write the byte-swapped intermediate file.
    bd.be2le()
    # Step 2: extract the words from it into the .txt file.
    bd.le2txt()
下面是略作修改的版本:减少了磁盘写入次数以提高效率,并且支持在命令行执行时附带需要转换的文件名:
import struct
import binascii
import sys


class Baidu(object):
    """Convert a Baidu IME ``.bdict`` dictionary file into a text word list.

    This variant builds each output in memory and writes it to disk with a
    single call. The conversion runs in two steps:

    1. :meth:`be2le` writes a copy of the input in which every adjacent
       pair of bytes is swapped (``<originfile>.le``).
    2. :meth:`le2txt` reads that copy as big-endian 16-bit units, collects
       every run of consecutive characters in U+4E00..U+9FFF and writes
       each run as one line of the text file.

    Attributes:
        originfile: Path of the input ``.bdict`` file.
        lefile: Path of the intermediate byte-swapped file.
        txtfile: Path of the output text file. It is derived by dropping
            the last five characters of ``originfile`` (the ``bdict``
            suffix) and appending ``txt``.
        buf: Two-slot scratch list kept for backward compatibility; the
            conversion no longer uses it.
        listwords: Characters of the word currently being collected.
    """

    # Byte offset in the swapped file at which scanning starts. The original
    # code sliced the *hex text* of the file at 0x350 characters, which is
    # 0x350 // 2 == 0x1A8 bytes; that behaviour is preserved here.
    WORD_DATA_OFFSET = 0x350 // 2

    # Inclusive code-point range treated as "Chinese character".
    FIRST_HAN = 0x4E00
    LAST_HAN = 0x9FFF

    def __init__(self, originfile):
        self.originfile = originfile
        self.lefile = originfile + '.le'
        self.txtfile = originfile[0:(len(originfile) - 5)] + 'txt'
        self.buf = [b'0', b'0']
        self.listwords = []

    def be2le(self):
        """Write ``self.lefile`` as ``self.originfile`` with byte pairs swapped.

        An input with an odd number of bytes is padded with a single zero
        byte first so that every byte has a partner.
        """
        with open(self.originfile, 'rb') as source:
            contents = source.read()

        # Pad to an even length with a real zero byte (the original code
        # accidentally reused the first byte of the file as padding).
        if len(contents) % 2:
            contents += b'\x00'

        # Two slice assignments replace the per-pair loop and its repeated
        # bytes concatenation, which copied the whole buffer on every step.
        swapped = bytearray(len(contents))
        swapped[0::2] = contents[1::2]
        swapped[1::2] = contents[0::2]

        with open(self.lefile, 'wb') as target:
            target.write(swapped)
        print('写入成功转为小端的字节流')

    def le2txt(self):
        """Extract the words from ``self.lefile`` into ``self.txtfile``.

        Each 16-bit big-endian unit is inspected on its own. Units inside
        U+4E00..U+9FFF extend the current word; any other unit ends it.
        All words are gathered first and written in one call, as UTF-8,
        one word per line.
        """
        with open(self.lefile, 'rb') as swapped_file:
            data = swapped_file.read()[self.WORD_DATA_OFFSET:]
        # Reports the number of hex characters scanned (two per byte), the
        # same figure the original printed.
        print(f'====len bytes of lefile:{len(data) * 2}')

        words = []
        self.listwords = []
        # Working on the integer value avoids decoding single units, which
        # raises for surrogate values found in binary data.
        for pos in range(0, len(data) - 1, 2):
            code_point = (data[pos] << 8) | data[pos + 1]
            if self.FIRST_HAN <= code_point <= self.LAST_HAN:
                self.listwords.append(chr(code_point))
            elif self.listwords:
                words.append(''.join(self.listwords))
                self.listwords = []
        # Keep a word that runs up to the very end of the data.
        if self.listwords:
            words.append(''.join(self.listwords))
            self.listwords = []

        # UTF-8 is requested explicitly so that writing Chinese text does not
        # depend on (or fail under) the platform's locale encoding.
        with open(self.txtfile, 'w', encoding='utf-8') as txt_file:
            txt_file.write(''.join(word + '\n' for word in words))
        print('写入txt成功')
if __name__ == '__main__':
    # Take the input file from the first command-line argument; when none is
    # given, fall back to the default file name. Only the "argument missing"
    # case is caught so that unrelated errors are not silently hidden.
    try:
        path = sys.argv[1]
    except IndexError:
        path = '8food.bdict'
    bd = Baidu(path)
    # Step 1: write the byte-swapped intermediate file.
    bd.be2le()
    # Step 2: extract the words from it into the .txt file.
    bd.le2txt()
存盘为bd.py文件
执行转换
python bd.py
不带参数执行时,程序会默认读取当前目录下的8food.bdict文件,并转换为8food.txt文件
也可以跟文件名,比如下载的8大菜系文件是 dict_file_734_20111227170031_1.0.0.bdict:
python bd.py dict_file_734_20111227170031_1.0.0.bdict
生成文件为dict_file_734_20111227170031_1.0.0.txt,内容为菜名:
阿胶炖肉
阿胶牛肉汤
阿胶养阴粥
阿胶养阴粥
安阳三熏
鹌鹑蛋烧稚
鹌鹑枸杞粥
鹌鹑枸杞粥
熬黄花鱼
八宝菠菜
这样百度输入法词库就转换为txt文件了!