jtyoui.word.TS 源代码

#!/usr/bin/python3.7
# -*- coding: utf-8 -*-
# @Time  : 2019/6/18 18:01
# @Author: Jtyoui@qq.com
from jtyoui.tools import pips
from jtyoui.data import word_nature


[文档]class TextSummary:
    def __init__(self, text, title):
        self.title = title
        self.text = text
        self.keywords = list()
        self.sentences = list()
        self.summary = list()
        self.cut = pips('jieba.posseg', 'jieba').cut  # 自动安装结巴包失败，请手动安装: pip install jieba
        self.extract_tags = pips('jieba.analyse', 'jieba').extract_tags  # 自动安装结巴包失败，请手动安装: pip install jieba

    def __setitem__(self, title, text):
        self.title = title
        self.text = text

    def _split_sentence(self):
        # 通过换行符对文档进行分段
        sections = self.text.split('\n')
        for section in sections:
            if section == '':
                sections.remove(section)

        # 通过分割符对每个段落进行分句
        for i in range(len(sections)):
            section = sections[i]
            text = ''
            k = 0
            for j in range(len(section)):
                char = section[j]
                text = text + char
                if char in '!。？' or j == len(section) - 1:
                    text = text.strip()
                    sentence = dict()
                    sentence['text'] = text
                    sentence['pos'] = dict()
                    sentence['pos']['x'] = i
                    sentence['pos']['y'] = k
                    # 将处理结果加入self.sentences
                    self.sentences.append(sentence)
                    text = ''
                    k = k + 1

        for sentence in self.sentences:
            sentence['text'] = sentence['text'].strip()
            if sentence['text'] == '':
                self.sentences.remove(sentence)

        # 对文章位置进行标注，通过mark列表，标注出是否是第一段、尾段、第一句、最后一句
        last_pos = dict()
        last_pos['x'] = 0
        last_pos['y'] = 0
        last_pos['mark'] = list()
        for sentence in self.sentences:
            pos = sentence['pos']
            pos['mark'] = list()
            if pos['x'] == 0:
                pos['mark'].append('first_section')
            if pos['y'] == 0:
                pos['mark'].append('first_sentence')
                last_pos['mark'].append('last_sentence')
            if pos['x'] == self.sentences[len(self.sentences) - 1]['pos']['x']:
                pos['mark'].append('last_section')
            last_pos = pos
        last_pos['mark'].append('last_sentence')

    def _calc_keywords(self):
        # 计算TF-IDF，取出排名靠前的20个词
        words_best = self.extract_tags(self.text, topK=20)
        # 提取第一段的关键词
        parts = self.text.lstrip().split('\n')
        first_part = ''
        if len(parts) >= 1:
            first_part = parts[0]
        words_best = words_best + self.extract_tags(first_part, topK=5)
        # 提取title中的关键词
        words_best = words_best + self.extract_tags(self.title, topK=3)
        # 将结果合并成一个句子，并进行分词
        text = ''
        for w in words_best:
            text += ' ' + w
        # 计算词性，提取名词和动词
        words = self.cut(text)
        keywords = list()
        for w in words:
            flag = w.flag
            word = w.word
            if flag.find('n') >= 0 or flag.find('v') >= 0:
                if len(word) > 1:
                    keywords.append(word)
        # 保留前20个关键词
        keywords = self.extract_tags(' '.join(keywords), topK=20)
        keywords = list(set(keywords))
        self.keywords = keywords

    def _calc_sentence_weight_by_keywords(self):
        # 计算句子的关键词权重
        for sentence in self.sentences:
            sentence['weightKeywords'] = 0
        for keyword in self.keywords:
            for sentence in self.sentences:
                if sentence['text'].find(keyword) >= 0:
                    sentence['weightKeywords'] += 1

    def _calc_sentence_weight_by_pos(self):
        # 计算句子的位置权重
        for sentence in self.sentences:
            mark = sentence['pos']['mark']
            weight_pos = 0
            if 'first_section' in mark:
                weight_pos = weight_pos + 2
            if 'first_sentence' in mark:
                weight_pos = weight_pos + 2
            if 'last_sentence' in mark:
                weight_pos = weight_pos + 1
            if 'last_section' in mark:
                weight_pos = weight_pos + 1
            sentence['weight_pos'] = weight_pos

    def _calc_sentence_weight_cue_words(self, index=None):
        # 计算句子的线索词权重
        if not index:
            index = word_nature()['adv_freq']
        for sentence in self.sentences:
            sentence['weightCueWords'] = 0
        for i in index:
            for sentence in self.sentences:
                if sentence['text'].find(i) >= 0:
                    sentence['weightCueWords'] = 1

    def _calc_sentence_weight(self, index=None):
        self._calc_sentence_weight_by_pos()
        self._calc_sentence_weight_cue_words(index)
        self._calc_sentence_weight_by_keywords()
        for sentence in self.sentences:
            sentence['weight'] = sentence['weight_pos'] + 2 * sentence['weightCueWords'] + sentence['weightKeywords']

[文档]    def calc_summary(self, ratio=0.1, index=None):
        # 清空变量
        self.keywords.clear()
        self.sentences.clear()
        self.summary.clear()

        # 调用方法，分别计算关键词、分句，计算权重
        self._calc_keywords()
        self._split_sentence()
        self._calc_sentence_weight(index)

        # 对句子的权重值进行排序
        self.sentences = sorted(self.sentences, key=lambda k: k['weight'], reverse=True)

        # 根据排序结果，取排名占前X%的句子作为摘要
        for i in range(len(self.sentences)):
            if i < ratio * len(self.sentences):
                sentence = self.sentences[i]
                self.summary.append(sentence['text'])

        return self.summary


if __name__ == '__main__':
    data = """6月17日22时55分，四川长宁县发生6.0级地震，震源深度16千米。地震发生两个小时后，离震中较近的四川省宜宾市珙县
    巡场镇宜宾市矿山急救医院迎来第一个新生儿。医生在余震和医院房屋出现损毁的情况下顶住压力和风险，为产妇接生，母子平安。"""
    ts = TextSummary(data, title='长宁县地震')
    print(ts.calc_summary())