# jtyoui.word.NAA — source code (page header left over from the docs export)

#!/usr/bin/python3.7
# -*- coding: utf-8 -*-
# @Time  : 2018/2/28 10:58
# @Author: Jtyoui@qq.com
import math

"""
NAA(New Word Analysis Algorithms):分析新词算法
"""


class NAA:
    """NAA (New Word Analysis Algorithms): discover new words in raw text.

    Candidate substrings are scored on frequency, cohesion (internal
    solidity) and freedom (entropy of neighbouring characters); see
    ``split``/``handle``/``filter`` for the pipeline stages.
    """

    # Class-level defaults kept for backward compatibility with code that
    # reads NAA.all_lens / NAA.naa_words; real state is per-instance.
    all_lens = 0.0
    naa_words = dict()

    def __init__(self, split_num=4, seq=0.001, cond=50, free=0.5):
        """Initialise matching and filtering parameters.

        :param split_num: maximum length (in characters) of candidate words
        :param seq: minimum frequency threshold for a keyword
        :param cond: cohesion (solidity) threshold
        :param free: freedom (neighbour entropy) threshold
        """
        self.split_num = split_num
        self.seq = seq
        self.cond = cond
        self.free = free
        # BUGFIX: previously only the mutable class attributes above existed,
        # so every instance mutated one shared dict; make state per-instance.
        self.all_lens = 0.0
        self.naa_words = dict()
[文档] def read_ls(self, ls): st = ''.join(ls) self.all_lens = len(st) for word in ls: self.split(words=word) print("split over!")
def __read_ls(self, ls): """读取链表,格式是[str,str,str,.....]一维链表 :param ls: 链表 """ for word in ls: self.split(words=word) print("split over!")
[文档] def read_string(self, st, split_seq='\n'): """讲字符按照split_seq格式来分割 :param st: 字符串 :param split_seq: 字符分割 :return: None """ self.all_lens = len(st) ls = st.split(split_seq) print("reader over!") self.__read_ls(ls=ls)
[文档] def read_file(self, file, file_encoding='utf-8-sig'): """读取文件内容,注意文件是UTF-8的格式且不是BOM格式 :param file: 读取的文件 :param file_encoding: 文本编码 """ fp = open(file, "r", encoding=file_encoding) d = fp.read() self.read_string(st=d) fp.close()
[文档] def split(self, words): """拆分字符,最大匹配num个字符,并也字典的形式返回 [出现次数,出现频率,凝固程度,自由程度,关键字的左邻,关键字的右邻](作为信息熵的衡量) """ lens = len(words) for i in range(0, lens): for j in range(1, self.split_num + 1): if i + j < lens: k = words[i:i + j] word = self.naa_words.get(k) if word: word[0] += 1 word[1] = word[0] / self.all_lens word[4].append(words[i - 1]) word[5].append(words[i + 1]) else: self.naa_words[k] = [1, 1 / self.all_lens, 1, 0, [words[i - 1]], [words[i + 1]]]
[文档] def handle(self): """处理数据 计算左邻字集合和右邻字集合有多随机,左邻字信息熵和右邻字信息熵中的较小值 计算凝固程度,自由程度 """ for k in self.naa_words: if len(k) == 1: continue word = self.naa_words[k] end_all = front_all = 0.0 left = word[1] / (self.naa_words[k[0]][1] * self.naa_words[k[1:]][1]) right = word[1] / (self.naa_words[k[-1]][1] * self.naa_words[k[:-1]][1]) for front in word[4]: if front in self.naa_words: front_all -= math.log(self.naa_words[front][1]) * self.naa_words[front][1] for end in word[5]: if end in self.naa_words: end_all -= math.log(self.naa_words[end][1]) * self.naa_words[end][1] word[2] = left if left < right else right word[3] = front_all if front_all < end_all else end_all print("handle over!")
    def filter(self, filter_cond=10, filter_free=0.5, flag=False):
        """Filter out unimportant candidates.

        Each surviving value is [count, frequency, cohesion, freedom].

        :param filter_cond: cohesion threshold for the second pass
        :param filter_free: freedom threshold for the second pass
        :param flag: AND vs OR combination of the two thresholds; default
            is OR, i.e. failing either one is enough to drop the word
        :return: dict of filtered keywords
        """
        key_words = dict()
        # First pass: keep only multi-character words that clear all three
        # instance-level thresholds (frequency, cohesion, freedom).
        for k in self.naa_words:
            if len(k) <= 1:
                continue
            splits = self.naa_words[k]
            if splits[1] > self.seq and splits[2] > self.cond and splits[3] > self.free:
                key_words[k] = [splits[0], splits[1], splits[2], splits[3]]
        new_key = key_words.copy()
        keys = key_words.keys()
        for max_key in keys:
            """
            Drop keywords that cannot be segmented entirely into known
            shorter keywords (e.g. truncated fragments of a phrase).
            """
            lens = len(max_key)
            if lens > 4:
                i = 2
                split_flag = 0
                # Greedy left-to-right segmentation over known keywords;
                # if it cannot consume the whole string, the word is dropped.
                while i <= lens:
                    if max_key[split_flag:i] in keys and max_key[split_flag:i] != max_key:
                        split_flag = i
                    i += 1
                if split_flag != lens:
                    new_key.pop(max_key)
                    continue
            for min_key in keys:
                """
                Drop subsets: a word contained in a longer keyword whose
                count is not clearly higher than the longer word's count.
                """
                max_keys = key_words[max_key]
                if max_key in min_key and max_key != min_key:
                    if max_keys[0] <= key_words[min_key][0] + 5:
                        new_key.pop(max_key)
                        break
                else:
                    # Second pass thresholds: AND when flag is True, OR otherwise.
                    if flag:
                        if max_keys[2] < filter_cond and max_keys[3] < filter_free:
                            new_key.pop(max_key)
                            break
                    elif max_keys[2] < filter_cond or max_keys[3] < filter_free:
                        new_key.pop(max_key)
                        break
        print("filter over!")
        return new_key
if __name__ == '__main__':
    # Example run: mine keywords out of a local text corpus and print
    # the surviving candidates with their statistics.
    analyzer = NAA(split_num=10, seq=0.0001)
    analyzer.read_file(file=r"D:\data.txt")
    analyzer.handle()
    result = analyzer.filter(filter_free=0.7, filter_cond=84, flag=False)
    for word, stats in result.items():
        count, freq, cohesion, freedom = stats
        print(f'关键字:{word},次数:{count},频率:{freq},凝聚度:{cohesion},自由度:{freedom}')