Source code for jtyoui.tools.SentenceMake

#!/usr/bin/python3.7
# -*- coding: utf-8 -*-
# @Time    : 2019/5/24 21:44
# @Email   : jtyoui@qq.com
# @Software: PyCharm
"""声明:该函数是爬取造句网(http://zaojv.com/)句子,仅仅用于学习交流。"""
import requests
from jtyoui.web import headers_ua
import re
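
# `headers_ua` is imported from jtyoui.web and is assumed here to return a
# dict of HTTP request headers carrying a browser User-Agent, so zaojv.com
# serves its normal HTML pages. A minimal stand-in, stated as an assumption
# about its contract rather than the library's actual implementation:
#
#     def headers_ua():
#         return {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'}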


class Sentence:
    """Sentence maker: given a word, produce example sentences that use it.

    >>> s = Sentence()
    >>> data = s['我爱你']
    >>> for jz in data: print(jz)
    """

    def __init__(self):
        self.url = 'http://zaojv.com/wordQueryDo.php'
        from bs4 import BeautifulSoup  # install with: pip install bs4
        self._BeautifulSoup = BeautifulSoup
        self.total = 1
        self.page = 1

    def _not_included_data(self, word):
        """Extract sentences for a word that the site has not indexed yet."""
        data_ls = []
        response = requests.post(self.url, headers=headers_ua(), data={'wo': word, 'directGo': 1, 'nsid': 0})
        text = response.text
        soup = self._BeautifulSoup(text, features="lxml")
        ls = soup.find('div', id='content').contents
        for d in ls:
            # Skip bare strings and embedded <script> blocks.
            if not isinstance(d, str) and ('function' not in d.text):
                t = d.text.replace('\xa0', '').strip()
                if '造句' in t:
                    # Strip bracketed "造句" annotations such as 【……造句】.
                    t = re.sub('[((\\[【].*[^造]*造句.*[)】\\])]', '', t)
                    data_ls.append(t)
        return data_ls

    def _analysis_data(self, text):
        """Extract sentences for a word that the site has indexed."""
        data_ls = []
        soup = self._BeautifulSoup(text, features="lxml")
        # The pager text is assumed to hold "current/total" at index 7.
        page = soup.find('div', style='text-align:center;margin-top:10px;').text.strip().split('\xa0')
        self.page, self.total = page[7].split(r'/')
        ls = soup.find('div', id='all').contents
        for d in ls:
            if not isinstance(d, str) and ('function' not in d.text):
                t = d.text.replace('\xa0', '').strip()
                if '造句' in t:
                    t = re.sub('[((\\[【].*[^造]*造句.*[)】\\])]', '', t)
                    data_ls.append(t)
        return data_ls

    def _redirect_link(self, word):
        """Get the URL that the word query redirects to."""
        response = requests.post(self.url, headers=headers_ua(), data={'wo': word, 'directGo': 1, 'nsid': 0})
        return response.url

    @staticmethod
    def _get_page_data(url):
        """Fetch the HTML of the given link."""
        response = requests.get(url, headers=headers_ua())
        return response.text
    def make_sentence(self, word):
        """Make sentences for the given word.

        :param word: a single word or phrase, e.g. 刘德华, 万绮雯, 宇宙
        """
        data_ls = []
        url_ = self._redirect_link(word)
        if url_ == self.url:
            # No redirect: the word is not indexed on the site.
            d = self._not_included_data(word)
        else:
            text = self._get_page_data(url_)
            d = self._analysis_data(text)
        data_ls.extend(d)
        # _analysis_data updates self.page and self.total; walk the remaining pages.
        while self.total != self.page:
            page = int(self.page) + 1
            u = url_[:-5] + '_' + str(page) + '.html'  # drop '.html', append '_<page>.html'
            text = self._get_page_data(u)
            data_ls.extend(self._analysis_data(text))
        return data_ls
    def __getitem__(self, item):
        return self.make_sentence(item)

if __name__ == '__main__':
    s = Sentence()
    data = s['我爱你']
    for jz in data:
        print(jz)
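
# Pagination sketch (an assumption inferred from the URL arithmetic in
# make_sentence, not documented behaviour of zaojv.com): an indexed word
# redirects to a page such as 'http://zaojv.com/123456.html' (the id here is
# hypothetical), and page n is reached by rewriting the '.html' suffix:
#
#     url_ = 'http://zaojv.com/123456.html'
#     u = url_[:-5] + '_' + str(2) + '.html'   # -> 'http://zaojv.com/123456_2.html'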