jtyoui.word.TS 源代码

#!/usr/bin/python3.7
# -*- coding: utf-8 -*-
# @Time  : 2019/6/18 18:01
# @Author: Jtyoui@qq.com
from jtyoui.tools import pips
from jtyoui.data import word_nature


class TextSummary:
    """Extractive text summarizer that ranks sentences by a simple score.

    Each sentence is scored from three signals: its position in the text,
    whether it contains a cue word, and how many keywords it contains.
    The highest-scoring sentences form the summary.

    Sentences are stored in ``self.sentences`` as dicts with the keys
    ``'text'`` and ``'pos'`` (``{'x': paragraph, 'y': sentence, 'mark': [...]}``)
    plus, once weights are computed, ``'weight_pos'``, ``'weightCueWords'``,
    ``'weightKeywords'`` and ``'weight'``.
    """

    def __init__(self, text, title):
        """Store the document and bind the jieba helpers.

        :param text: document body; paragraphs are separated by ``'\\n'``.
        :param title: document title, used as an extra keyword source.
        """
        self.title = title
        self.text = text
        self.keywords = list()
        self.sentences = list()
        self.summary = list()
        # If automatic installation of jieba fails, install it manually:
        #   pip install jieba
        # NOTE(review): `pips` presumably imports the module and installs the
        # package when missing -- confirm against jtyoui.tools.
        self.cut = pips('jieba.posseg', 'jieba').cut
        self.extract_tags = pips('jieba.analyse', 'jieba').extract_tags

    def __setitem__(self, title, text):
        """Replace the document in place: ``summary[title] = text``."""
        self.title = title
        self.text = text

    def _split_sentence(self):
        """Split ``self.text`` into sentences and tag each with position marks.

        New sentences are appended to ``self.sentences``; empty sentences are
        then dropped and every remaining sentence gets a ``pos['mark']`` list
        drawn from ``first_section``, ``first_sentence``, ``last_sentence``
        and ``last_section``.
        """
        # Split into paragraphs on newlines and drop the empty ones. A new
        # list is built instead of removing during iteration, which used to
        # skip every other blank line and shift the paragraph indices.
        sections = [part for part in self.text.split('\n') if part != '']

        # Split each paragraph into sentences on the terminator characters;
        # whatever remains at the end of a paragraph is also a sentence.
        for x, section in enumerate(sections):
            y = 0
            buffer = []
            final_index = len(section) - 1
            for j, char in enumerate(section):
                buffer.append(char)
                if char in '!。?' or j == final_index:
                    self.sentences.append({
                        'text': ''.join(buffer).strip(),
                        'pos': {'x': x, 'y': y},
                    })
                    buffer = []
                    y += 1

        # Drop sentences that are empty after stripping. Filtering into a new
        # list avoids the skipped-element bug of remove-while-iterating.
        kept = []
        for sentence in self.sentences:
            sentence['text'] = sentence['text'].strip()
            if sentence['text'] != '':
                kept.append(sentence)
        self.sentences[:] = kept

        if not self.sentences:
            return

        # Mark positions: first/last paragraph, first/last sentence of a
        # paragraph. A sentence is known to be the last of its paragraph only
        # when the next paragraph starts (or the text ends), so the mark is
        # added to the previously visited sentence.
        last_x = self.sentences[-1]['pos']['x']
        previous = None
        for sentence in self.sentences:
            pos = sentence['pos']
            marks = pos['mark'] = []
            if pos['x'] == 0:
                marks.append('first_section')
            if pos['y'] == 0:
                marks.append('first_sentence')
                if previous is not None:
                    previous['mark'].append('last_sentence')
            if pos['x'] == last_x:
                marks.append('last_section')
            previous = pos
        previous['mark'].append('last_sentence')

    def _calc_keywords(self):
        """Compute ``self.keywords`` from the text, its first line and the title."""
        # Top 20 tags of the whole text (TF-IDF per the original comment).
        words_best = self.extract_tags(self.text, topK=20)
        # Keywords of the first paragraph (str.split always yields >= 1 part).
        first_part = self.text.lstrip().split('\n')[0]
        words_best = words_best + self.extract_tags(first_part, topK=5)
        # Keywords of the title.
        words_best = words_best + self.extract_tags(self.title, topK=3)

        # Merge the candidates into one string and run part-of-speech tagging.
        text = ''.join(' ' + w for w in words_best)
        # Keep nouns and verbs (flag contains 'n' or 'v') longer than one char.
        candidates = [
            pair.word
            for pair in self.cut(text)
            if ('n' in pair.flag or 'v' in pair.flag) and len(pair.word) > 1
        ]

        # Keep the top 20 and de-duplicate while preserving order, so the
        # result is deterministic (a plain set() has arbitrary order).
        top = self.extract_tags(' '.join(candidates), topK=20)
        self.keywords = list(dict.fromkeys(top))

    def _calc_sentence_weight_by_keywords(self):
        """Set ``weightKeywords`` to the number of keywords each sentence contains."""
        for sentence in self.sentences:
            text = sentence['text']
            sentence['weightKeywords'] = sum(1 for kw in self.keywords if kw in text)

    def _calc_sentence_weight_by_pos(self):
        """Set ``weight_pos`` from the position marks of each sentence."""
        # first paragraph +2, first sentence +2, last sentence +1, last paragraph +1
        bonus = (
            ('first_section', 2),
            ('first_sentence', 2),
            ('last_sentence', 1),
            ('last_section', 1),
        )
        for sentence in self.sentences:
            mark = sentence['pos']['mark']
            sentence['weight_pos'] = sum(value for name, value in bonus if name in mark)

    def _calc_sentence_weight_cue_words(self, index=None):
        """Set ``weightCueWords`` to 1 for sentences containing any cue word.

        :param index: iterable of cue words; when falsy, the ``'adv_freq'``
            entry of ``word_nature()`` is used.
        """
        if not index:
            index = word_nature()['adv_freq']
        # Materialize once so a one-shot iterable can be reused per sentence.
        cue_words = tuple(index)
        for sentence in self.sentences:
            text = sentence['text']
            sentence['weightCueWords'] = 1 if any(cue in text for cue in cue_words) else 0

    def _calc_sentence_weight(self, index=None):
        """Combine the three partial weights into ``sentence['weight']``.

        ``weight = weight_pos + 2 * weightCueWords + weightKeywords``
        """
        self._calc_sentence_weight_by_pos()
        self._calc_sentence_weight_cue_words(index)
        self._calc_sentence_weight_by_keywords()
        for sentence in self.sentences:
            sentence['weight'] = (
                sentence['weight_pos']
                + 2 * sentence['weightCueWords']
                + sentence['weightKeywords']
            )

    def calc_summary(self, ratio=0.1, index=None):
        """Return the highest-weighted sentences of the text.

        :param ratio: fraction of the sentences to keep; a sentence at rank
            ``i`` (0-based) is kept when ``i < ratio * sentence_count``.
        :param index: optional iterable of cue words, forwarded to the
            cue-word weighting.
        :return: list of sentence texts, highest weight first. A fresh list is
            returned on every call so results held by the caller are not
            overwritten by later calls.
        """
        # Reset state left over from a previous run.
        self.keywords = []
        self.sentences = []

        # Compute keywords, split into sentences, then score the sentences.
        self._calc_keywords()
        self._split_sentence()
        self._calc_sentence_weight(index)

        # Rank by weight; sorted() is stable, so ties keep document order.
        self.sentences = sorted(self.sentences, key=lambda s: s['weight'], reverse=True)

        # Keep the top `ratio` share of the ranked sentences.
        limit = ratio * len(self.sentences)
        self.summary = [s['text'] for i, s in enumerate(self.sentences) if i < limit]
        return self.summary
if __name__ == '__main__':
    # Demo: summarize a short news item about the Changning earthquake and
    # print the selected sentences.
    sample_text = """6月17日22时55分,四川长宁县发生6.0级地震,震源深度16千米。地震发生两个小时后,离震中较近的四川省宜宾市珙县 巡场镇宜宾市矿山急救医院迎来第一个新生儿。医生在余震和医院房屋出现损毁的情况下顶住压力和风险,为产妇接生,母子平安。"""
    summarizer = TextSummary(sample_text, title='长宁县地震')
    result = summarizer.calc_summary()
    print(result)