jtyoui.jp.Japan 源代码

# -*- coding: utf-8 -*-
# @Time    : 2019/5/7 19:48
# @Email  : jtyoui@qq.com
# @Software: PyCharm
from urllib.parse import quote
from jtyoui.error import LibraryNotInstallError
from jtyoui.decorators import replace_regular
from jtyoui.web import header
import requests

# NOTE: remember to replace these cached values (the cookie in particular) next time.
# Raw "name: value" header text, one header per line. It is the default for the
# `response_headers` parameter of `analysis`, which passes it through `header(...)`
# before handing the result to `requests.get`.
# NOTE(review): this embeds a captured session cookie in source — confirm that is acceptable.
Response_Headers = """cookie: HJ_UID=a0752831-ba30-9486-ddc8-66bcbb7f303a; _REF=https://www.baidu.com/link?url%3DTvv2c125EbCEB2T5xBtSlQeMb4zSO1v2ZkeB8uvhFXacQdks-Z0OXCabLXXX-Wpa&wd%3D&eqid%3D9cbe02970005752c000000025cd1684b; _REG=www.baidu.com|; _SREG_3=www.baidu.com|; HJ_CST=0; HJ_CSST_3=0; _SREF_3=https://www.baidu.com/link?url%3DTvv2c125EbCEB2T5xBtSlQeMb4zSO1v2ZkeB8uvhFXacQdks-Z0OXCabLXXX-Wpa&wd%3D&eqid%3D9cbe02970005752c000000025cd1684b; TRACKSITEMAP=3%2C6%2C; HJ_SID=5b45f4b2-9ae1-4a93-803b-7e2d08faba78; HJ_SSID_3=e959e1f7-6b25-4c49-a26d-a914297c0f32
user-agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36"""

# Part-of-speech labels as they appear in the scraped heading text, mapped to
# the short tags written to the output file. `analysis` replaces every
# occurrence of a key in the heading with its value.
# NOTE(review): the literal had no closing brace in this copy of the file;
# confirm that no entries were lost after the last one.
word = {
    '自动词・五段/一类': '自Ⅰ',
    '形容动词/ナ形容词': '形Ⅱ',
    '他动词・一段/二类': '他Ⅱ',
    '他动词・五段/一类': '他Ⅰ',
    '自动词・一段/二类': '自Ⅱ',
    '形容词/イ形容词': '形Ⅰ',
    '自动词・サ变/三类': '自Ⅲ',
    '他动词・サ变/三类': '他Ⅲ',
    '他・サ变/三类': '他Ⅲ',
    '他・一段/二类': '他Ⅱ',
    '他・五段/一类': '他Ⅰ',
}

@replace_regular(r'((.+))|《(.+)》|[0-9].', '')
def sub_names(names):
    """Hand ``names`` back untouched; any clean-up is done by the decorator.

    The function body is an identity function. It exists only as a target for
    ``replace_regular``, which receives the pattern and the replacement ``''``.

    NOTE(review): presumably the decorator removes every match of the pattern
    from the text passing through this function — confirm against
    ``jtyoui.decorators.replace_regular``.

    :param names: text to clean
    :return: ``names`` as given (before the decorator is applied)
    """
    return names
def analysis(data_name, response_headers=Response_Headers):
    """Look up every word of a word-list file on the "小D" online dictionary.

    Each distinct line of ``data_name`` that is longer than one character is
    requested from ``https://dict.hjenglish.com/jp/jc/<url-quoted word>``.
    When the page yields a pronunciation, a part-of-speech heading and a
    definition of 2 to 40 characters, one tab-separated record
    (``reading[(word)]<TAB><pos><TAB>definition``) is printed and written to
    ``data_name + '.data'``. Words whose page lacks any of those parts are
    skipped silently.

    :param data_name: path of the UTF-8 word list, one word per line
    :param response_headers: raw header text; converted with ``header(...)``
        and sent with every request
    :raises LibraryNotInstallError: if ``bs4`` cannot be imported
    """
    try:
        from bs4 import BeautifulSoup  # pip install bs4
    except ImportError as err:
        raise LibraryNotInstallError('安装 pip install bs4') from err

    seen = set()  # words already looked up; a set keeps the duplicate check O(1)
    # Both files are managed by `with` so the output is closed (and fully
    # written) even when a request or the parsing raises.
    with open(data_name + '.data', 'w', encoding='utf8') as ws, \
            open(data_name, encoding='utf8') as f:
        for line in f:
            # Strip only the line terminator: a final line without a trailing
            # newline must not lose its last character.
            name = line.rstrip('\n')
            if len(name) <= 1 or name in seen:
                continue
            seen.add(name)

            url = f'https://dict.hjenglish.com/jp/jc/{quote(name)}'
            response = requests.get(url=url, headers=header(response_headers))
            soup = BeautifulSoup(response.text, 'html.parser')

            pronounces = soup.find(class_='pronounces')
            simple = soup.find(class_='simple')
            # Any missing node means the page has no usable entry for this word.
            if pronounces is None or pronounces.span is None or simple is None:
                continue
            heading = simple.h2
            definitions = simple.ul
            if heading is None or definitions is None:
                continue

            # [1:-1] drops the first and last character of each text
            # (presumably surrounding brackets — verify against the page markup).
            reading = pronounces.span.text[1:-1]
            pos = heading.text[1:-1]

            meaning = definitions.text.replace('\n', '')
            # The trailing [:-1] drops the final character of the cleaned text.
            meaning = sub_names(meaning).replace('。', ';')[:-1]
            if not 1 < len(meaning) <= 40:
                continue

            # Abbreviate the part-of-speech labels, then drop any leftover '词'.
            pos = '<' + pos + '>'
            for label, tag in word.items():
                pos = pos.replace(label, tag)
            pos = pos.replace('词', '')

            # The looked-up word is appended in parentheses only when it
            # differs from the reading.
            if name != reading:
                string = f'{reading}({name})\t{pos}\t{meaning}'
            else:
                string = f'{reading}\t{pos}\t{meaning}'
            print(string)
            ws.write(string + '\n')
            # Flush per record so results already fetched reach the file even
            # if a later request fails.
            ws.flush()
def cut(text_name):
    """Extract Japanese words from a raw text file.

    The whole file is tokenized with janome. The base form of every token,
    with spaces removed, is written on its own line to ``text_name + '.jp'``
    when it is longer than one character.

    :param text_name: path of the raw UTF-8 text file
    :return: path of the file holding the extracted words
    :raises LibraryNotInstallError: if ``janome`` cannot be imported
    """
    try:
        from janome.tokenizer import Tokenizer  # pip install janome
    except ImportError as err:
        raise LibraryNotInstallError('安装 pip install janome') from err

    out_path = text_name + '.jp'
    tokenizer = Tokenizer()
    # Close both files before returning: the caller reads `out_path` right
    # away, so the written words must be flushed to disk by then.
    with open(out_path, 'w', encoding='utf8') as out, \
            open(text_name, encoding='utf-8') as src:
        for token in tokenizer.tokenize(src.read()):
            base = token.base_form.replace(' ', '')
            if len(base) > 1:
                out.write(base + '\n')
    return out_path
if __name__ == '__main__':
    # Extract the words from the sample exam text, then look each one up.
    analysis(cut('2011年日语高考真题.txt'))