Source code for jtyoui.word.neologism

#!/usr/bin/python3.7
# -*- coding: utf-8 -*-
# @Time    : 2018/2/28 10:58
# @Email  : jtyoui@qq.com
# @Software: PyCharm
import math
import re
from jtyoui.regular import Non_Chinese
from jtyoui.decorators import replace_regular
from collections import Counter
import os


@replace_regular(Non_Chinese, '')
def clean(data):  # remove non-Chinese characters
    return data, len(data)
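replace_regular and Non_Chinese come from other jtyoui modules and are not shown on this page. A minimal self-contained sketch of the assumed behaviour, using a hypothetical stand-in pattern for Non_Chinese (every match is substituted with '' before the wrapped function runs):

import re


def replace_regular(pattern, repl):
    # Assumed semantics: apply re.sub(pattern, repl, data) to the string
    # argument before calling the decorated function.
    def decorator(func):
        def wrapper(data):
            return func(re.sub(pattern, repl, data))
        return wrapper
    return decorator


NON_CHINESE = r'[^\u4e00-\u9fa5]'  # hypothetical stand-in for jtyoui.regular.Non_Chinese


@replace_regular(NON_CHINESE, '')
def clean(data):  # remove non-Chinese characters
    return data, len(data)


print(clean('abc,中文123词'))  # ('中文词', 3)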
class Neologism:
    def __init__(self):
        self.ALL_WORDS = dict()  # key -> [count, frequency, cohesion, freedom, left neighbours, right neighbours]
        self.All_LENS = 0  # total number of characters read
    def read_file(self, file, split_num):
        """Read the corpus from a file.

        :param file: path of the file
        :param split_num: maximum length of a candidate word
        """
        if os.path.exists(file):
            with open(file, encoding='utf-8') as fp:
                for line in fp:
                    self.read_string(line, split_num)
    def read_string(self, st, split_num, split_seq='[,。!?:.,?]'):
        """Split the string on the split_seq pattern and process each segment.

        :param st: input string
        :param split_num: maximum length of a candidate word
        :param split_seq: regular expression of separator characters
        """
        ls = re.split(split_seq, st)
        self.read_ls(ls, split_num)
    def read_ls(self, ls, split_num):
        """Process a list of string segments.

        :param ls: list of strings
        :param split_num: maximum length of a candidate word
        """
        for word in ls:
            self.All_LENS += len(word)
            clean_data, lens = clean(data=word)
            if lens > 2:
                self._split(clean_data, lens, split_num)
    def _split(self, words, lens, split_num):
        """Split the segment into all substrings of at most split_num characters and
        record each one in ALL_WORDS as
        [count, frequency, cohesion, freedom, left neighbours, right neighbours]
        (the neighbour lists are later used to measure information entropy).
        """
        for i in range(lens):
            for j in range(i + 1, split_num + 1 + i):
                if j < lens:
                    key = words[i:j]
                    word = self.ALL_WORDS.get(key)
                    if word:
                        word[0] += 1
                        word[4].append(words[i - 1])  # left neighbour (wraps to the last character when i == 0)
                        word[5].append(words[j])  # right neighbour
                    else:
                        self.ALL_WORDS[key] = [1, 0.0, 1, 0, [words[i - 1]], [words[j]]]
    def statistics(self):
        """Compute the frequency of every recorded word."""
        for key in self.ALL_WORDS:
            self.ALL_WORDS[key][1] = self.ALL_WORDS[key][0] / self.All_LENS
    def _information_entropy(self, word_ls):
        """Information entropy of a list of neighbouring characters."""
        entropy_all = 0.0
        key_count = Counter(word_ls)
        for key, count in key_count.items():
            word = self.ALL_WORDS.get(key)
            if word:
                entropy_all -= math.log(word[1]) * word[1] * count  # entropy contribution of this neighbouring character
        return entropy_all
    def handle(self):
        """Process the collected statistics.

        For every multi-character key, compute the cohesion (the smaller of the two
        split cohesions) and the freedom (the smaller of the left-neighbour and
        right-neighbour information entropies). A worked numeric sketch follows the
        class definition below.
        """
        for key, word_list in self.ALL_WORDS.items():
            if len(key) > 1:
                # Compare the cohesion of the two possible splits; the smaller value shows
                # which side the word sticks to more easily.
                left = word_list[1] / (self.ALL_WORDS[key[0]][1] * self.ALL_WORDS[key[1:]][1])  # split: first character + rest
                right = word_list[1] / (self.ALL_WORDS[key[-1]][1] * self.ALL_WORDS[key[:-1]][1])  # split: rest + last character
                word_list[2] = left if left < right else right
                # Compare the entropies of the left-neighbour and right-neighbour sets;
                # the side with the smaller entropy constrains the word more, so that value is kept.
                front_all = self._information_entropy(word_list[4])  # entropy of the left-neighbour set
                end_all = self._information_entropy(word_list[5])  # entropy of the right-neighbour set
                word_list[3] = front_all if front_all < end_all else end_all
    def filter_words(self, count, frequency, cond, free):
        """Filter out unimportant candidates based on [count, frequency, cohesion, freedom].

        :param count: minimum number of occurrences of a key
        :param frequency: frequency threshold
        :param cond: cohesion threshold
        :param free: freedom threshold
        :return: filtered list of (key, statistics) tuples
        """
        ls = []
        for key, one_word in self.ALL_WORDS.items():
            if len(key) <= 1:
                continue
            if (one_word[0] > count or one_word[1] > frequency) and one_word[2] > cond and one_word[3] > free:
                ls.append((key, one_word))
        return remove_subset(ls)
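The cohesion and freedom values produced by handle can be checked by hand. A minimal sketch with made-up toy numbers (not taken from the library); note that _information_entropy above weights each distinct neighbour by its corpus-wide frequency, whereas this sketch uses the standard entropy over the neighbour list itself:

import math
from collections import Counter

# Toy statistics for a two-character candidate word "电影".
p_word = 0.004                 # frequency of "电影"
p_first, p_rest = 0.02, 0.01   # frequencies of "电" and "影"
cohesion = p_word / (p_first * p_rest)  # 0.004 / 0.0002 = 20.0

# Freedom: entropy of the characters observed to the left of "电影".
left_neighbours = ['看', '看', '部', '场']
total = len(left_neighbours)
freedom = -sum(c / total * math.log(c / total)
               for c in Counter(left_neighbours).values())  # about 1.04

print(cohesion, freedom)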
def remove_subset(ls: list) -> list:
    """Remove items whose key is a substring of a longer key.

    For example: ['aa', 'a', 'ab'] --> ['aa', 'ab']
    """
    ls.sort(key=lambda x: len(x[0]), reverse=True)
    total = []
    for subset in ls:
        flag = True
        for word in total:
            if subset[0] in word[0]:
                flag = False
                break
        if flag:
            total.append(subset)
    return total
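A quick illustration of remove_subset as it is used by filter_words: each item is a (key, statistics) pair, and keys that are substrings of a longer surviving key are dropped (the statistics lists are shortened to a single number here for readability):

candidates = [('aa', [3]), ('a', [9]), ('ab', [2])]
print(remove_subset(candidates))  # [('aa', [3]), ('ab', [2])]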
def mains(file, split_num, count, frequency, cond, free):
    """New-word discovery. The result is saved to result.txt in the current working directory.

    :param file: path of the text file
    :param split_num: maximum length of a candidate word
    :param count: minimum number of occurrences of a key
    :param frequency: frequency threshold
    :param cond: cohesion threshold
    :param free: freedom threshold
    """
    wf = open('result.txt', 'w', encoding='utf-8')
    n = Neologism()
    n.read_file(file, split_num)
    n.statistics()
    n.handle()
    print('正在保存文件数据:result.txt')
    for k, v in n.filter_words(count=count, frequency=frequency, cond=cond, free=free):
        s = F'关键字:{k} 次数:{v[0]} 频率:{v[1]} 凝聚度:{v[2]} 自由度:{v[3]}'
        wf.write(s + '\n')
        wf.flush()
    wf.close()
    print('保存完毕')
if __name__ == '__main__':
    mains(r'D:\data.txt', 6, count=10, frequency=0.0001, cond=84, free=0.7)
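For quick experiments the class can also be driven from an in-memory string instead of a file; a sketch, with illustrative (untuned) thresholds:

n = Neologism()
n.read_string('今天天气很好,明天天气也很好,天气预报说后天天气还是很好', split_num=4)
n.statistics()
n.handle()
for word, stats in n.filter_words(count=1, frequency=0.0001, cond=2, free=0.1):
    print(word, stats[:4])  # key, then [count, frequency, cohesion, freedom]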