jtyoui.baidu.Encyclopedias 源代码

#!/usr/bin/python3.7
# -*- coding: utf-8 -*-
# @Time    : 2019/4/28 21:56
# @Email  : jtyoui@qq.com
# @Software: PyCharm
from html.parser import HTMLParser
from urllib.parse import quote
import requests
from jtyoui.web import headers_ua
import re


class _InfoSearch(HTMLParser):
    """基类"""

    def __init__(self):
        HTMLParser.__init__(self)
        self._data_flag = ''
        self.desc = ''
        self._desc_flag = False
        self.info_name = []
        self.info_value = []
        self._info_name = False
        self._info_value = False
        self.info = {}

    def handle_starttag(self, tag, attrs):  # 开始标签
        if len(attrs) > 0:
            attr = attrs[0]
        else:
            return
        if tag == 'div' and 'lemma-summary' in attr:
            self._desc_flag = True
        elif tag == 'dt' and 'basicInfo-item name' in attr:
            self._info_name = True
        elif tag == 'dd' and 'basicInfo-item value' in attr:
            self._info_value = True
        elif tag == 'div' and 'configModuleBanner' in attr:
            self._desc_flag = False
        else:
            return
        self._data_flag = ''

    def handle_endtag(self, tag):  # 结束标签
        _data = self._data_flag.replace('\n', '').replace(u'\xa0', '')
        if tag == 'dt':
            self._info_name = False
            if _data:
                self.info_name.append(_data)
        elif tag == 'dd':
            self._info_value = False
            if _data:
                self.info_value.append(_data)

    def handle_data(self, data):  # 内容
        if self._desc_flag:
            self.desc += data
        elif self._info_name:
            self._data_flag += data
        elif self._info_value:
            self._data_flag += data

    def basic_info(self):  # 基本信息
        for k, v in zip(self.info_name, self.info_value):
            if k == v:
                continue
            self.info[k] = v
        return self.info

    def describe(self):  # 摘要
        return self.desc.replace('\n', '').replace(u'\xa0', '')


class BaiDuInfoSearch:
    """Look up the summary and the basic-info box of a Baidu Baike entry.

    ``data`` is either the HTML of an entry page or an entry name; when it
    does not contain ``'<html>'`` it is treated as a name and the page is
    downloaded with ``Load_BaiDuBaiKe``.

    >>> bd = BaiDuInfoSearch('玛卡')
    >>> print(bd.desc())
    >>> print(bd.info())
    """

    # Bracketed markers such as ``[1]`` or ``[2-3]`` that are removed from
    # every returned text.
    _CITATION = re.compile(r'\[\d*-?\d*\]')

    def __init__(self, data):
        if '<html>' not in data:
            data = Load_BaiDuBaiKe(data)
        self._BD = _InfoSearch()
        self._BD.feed(data)

    def info(self):
        """Return the basic-info name/value pairs with bracket markers removed.

        :return: a new dict mapping cleaned names to cleaned values
        """
        strip = self._CITATION.sub
        # The previous implementation cleaned ``str(dict)`` and passed the
        # result to ``eval``; cleaning each key and value directly yields the
        # same dict without evaluating text that originates from a web page.
        return {
            strip('', key): strip('', value)
            for key, value in self._BD.basic_info().items()
        }

    def desc(self):
        """Return the summary text with bracket markers removed."""
        return self._CITATION.sub('', self._BD.describe())
def Load_BaiDuBaiKe(name, timeout=10):
    """Download the Baidu Baike page of an entry.

    :param name: name of the Baidu Baike entry; it is URL-quoted before use
    :param timeout: seconds passed to ``requests.get`` as its timeout, so a
        stalled connection raises instead of blocking indefinitely
    :return: the page content decoded as UTF-8 text
    """
    url = f'https://baike.baidu.com/item/{quote(name)}'
    response = requests.get(url, headers=headers_ua(), timeout=timeout)
    return response.content.decode('utf-8')
if __name__ == '__main__':
    # Demo run: fetch one entry, then print its summary and basic info.
    import pprint

    searcher = BaiDuInfoSearch('玛卡')
    summary = searcher.desc()
    print(summary)
    pprint.pprint(searcher.info())