# Source code for jtyoui.neuralNetwork.communal.Vocabs

#!/usr/bin/python3.7
# -*- coding: utf-8 -*-
# @Time  : 2019/8/1 15:53
# @Author: Jtyoui@qq.com
from collections import Counter
import json


# Reserved IDs:
#   1 - token that is not present in the vocab (unknown)
#   0 - padding
#   2 - end-of-sequence marker
def read_test(path, vocab_path='./vocab.json'):
    """Encode an underscore-separated token file as lists of vocabulary IDs.

    Each line of ``path`` is stripped, split on ``'_'`` and every token is
    replaced by its ID from the JSON vocabulary. Tokens missing from the
    vocabulary are encoded as ``1``.

    Args:
        path: Text file with one ``'_'``-separated token sequence per line
            (opened with the platform default encoding).
        vocab_path: JSON file holding the token -> ID mapping. Defaults to
            ``./vocab.json`` in the current working directory.

    Returns:
        A list with one list of integer IDs per input line.

    Raises:
        OSError: If either file cannot be opened.
        json.JSONDecodeError: If the vocabulary file is not valid JSON.
    """
    # Load the vocabulary inside a context manager so the handle is closed
    # deterministically instead of being left to the garbage collector.
    with open(vocab_path) as vocab_fp:
        vocab = json.load(vocab_fp)

    with open(path) as fp:
        return [
            [vocab.get(token, 1) for token in line.strip().split('_')]
            for line in fp
        ]


def read_train(tag, path, vocab_path='./vocab.json'):
    """Read a tab-separated tagging corpus and encode tokens and labels.

    The file at ``path`` holds one ``token<TAB>label`` pair per line, with
    sentences separated by blank lines. Tokens are mapped to IDs through the
    JSON vocabulary (``1`` for tokens that are missing from it) and labels
    are mapped through ``tag``.

    Args:
        tag: Mapping from label string to integer label ID.
        path: Corpus file (opened with the platform default encoding).
        vocab_path: JSON file holding the token -> ID mapping. Defaults to
            ``./vocab.json`` in the current working directory.

    Returns:
        A ``(train, test)`` tuple. ``train`` is a list of token-ID lists, one
        per sentence; ``test`` is the parallel list of label sequences, where
        every label ID is wrapped in its own single-element list.

    Raises:
        OSError: If either file cannot be opened.
        ValueError: If a non-empty line does not contain exactly one tab.
        KeyError: If a label is not present in ``tag``.
    """
    with open(path) as fp:
        blocks = fp.read().split('\n\n')

    # Context manager so the vocabulary handle is always closed.
    with open(vocab_path) as vocab_fp:
        vocab = json.load(vocab_fp)

    train, test = [], []
    for block in blocks:
        # Skip empty lines so that a missing or extra blank line (e.g. a file
        # ending in a single newline) neither drops the last sentence nor
        # crashes the tab split below.
        rows = [line for line in block.split('\n') if line]
        if not rows:
            continue
        token_ids, labels = [], []
        for row in rows:
            token, label = row.split('\t')
            token_ids.append(vocab.get(token, 1))
            labels.append([tag[label]])
        train.append(token_ids)
        test.append(labels)
    return train, test


def get_vocab(path, min_count=10, vocab_path='./vocab.json'):
    """Build a frequency-ranked vocabulary and save it as JSON.

    Every line of ``path`` is stripped and split on ``'_'``. Tokens that
    occur at least ``min_count`` times are kept, ordered from most to least
    frequent (ties keep first-seen order) and numbered from 3 upwards; per
    the module comments, IDs 0, 1 and 2 are reserved for padding, unknown
    tokens and the end marker.

    Args:
        path: Text file with one ``'_'``-separated token sequence per line
            (opened with the platform default encoding).
        min_count: Minimum number of occurrences for a token to be kept.
        vocab_path: Where the JSON vocabulary is written. Defaults to
            ``./vocab.json`` in the current working directory.

    Returns:
        The token -> ID dictionary that was written to ``vocab_path``.

    Raises:
        OSError: If ``path`` cannot be read or ``vocab_path`` cannot be
            written.
    """
    counts = Counter()
    with open(path) as fp:
        for line in fp:
            counts.update(line.strip().split('_'))

    # most_common() is a stable descending sort by count, so equally
    # frequent tokens stay in first-seen order.
    frequent = [
        token for token, freq in counts.most_common() if freq >= min_count
    ]
    vocab = {token: index for index, token in enumerate(frequent, start=3)}

    # Close (and therefore flush) the output file explicitly; an unclosed
    # handle is only written out whenever the garbage collector reclaims it.
    with open(vocab_path, 'w') as out:
        json.dump(vocab, out)
    return vocab


if __name__ == '__main__':
    # Demo driver: build the vocabulary, then encode a training corpus and a
    # test file and print the results.
    # NOTE(review): every path below is None, which open() rejects with a
    # TypeError - presumably placeholders for real corpus paths; confirm.
    get_vocab(path=None)

    # BIO-style label set numbered 0..6 in declaration order.
    label_names = ['O', 'B-a', 'I-a', 'B-b', 'I-b', 'B-c', 'I-c']
    tags = {name: number for number, name in enumerate(label_names)}

    encoded = read_train(tags, path=None)
    trains, tests = encoded
    print(trains)
    print(tests)

    test_ids = read_test(path=None)
    print(test_ids)