jtyoui.statistics.distance.HammingDistance 源代码

#!/usr/bin/python3.7
# -*- coding: utf-8 -*-
# @Time  : 2019/3/18 9:36
# @Author: Jtyoui@qq.com
from jtyoui.decorators import replace_regular
from jtyoui.regular import punctuation_re

import hashlib

"""
海明距离
在信息编码中,两个合法代码对应位上编码不同的位数称为码距,又称海明距离。
举例如下:10101和00110从第一位开始依次有第一位、第四、第五位不同,则海明距离为3。
"""


def handle(participle_ls, weight, f):
    """Pair every token with a weight and reduce the pairs to a simHash value.

    :param participle_ls: iterable of tokens (a list of segmented words)
    :param weight: optional mapping from token to weight; a token missing
        from the mapping gets weight 1, and when the mapping is falsy
        (``None`` or empty) every token gets weight 1
    :param f: number of bits in the simHash
    :return: whatever ``features_dict`` returns for the weighted tokens
    """
    if weight:
        weighted_tokens = [(token, weight.get(token, 1)) for token in participle_ls]
    else:
        weighted_tokens = [(token, 1) for token in participle_ls]
    return features_dict(weighted_tokens, f)
def hash_func(x):
    """Return the MD5 digest of ``x`` as a non-negative integer.

    :param x: bytes-like data accepted by ``hashlib.md5``
    :return: the 128-bit digest read as a big-endian integer
    """
    digest = hashlib.md5(x).digest()
    # Big-endian interpretation of the raw digest equals parsing the hex
    # digest in base 16.
    return int.from_bytes(digest, byteorder='big')
def features_dict(features, f):
    """Fold weighted features into a single simHash fingerprint.

    Each feature is hashed with ``hash_func``; for every one of the ``f`` bit
    positions the feature's weight is added to a running tally when that bit
    of the hash is set and subtracted otherwise. Bit positions whose final
    tally is strictly positive are set in the result.

    :param features: iterable of ``(text, weight)`` pairs; ``text`` must
        support ``.encode('utf-8')``
    :param f: number of bits in the simHash
    :return: the simHash value as an integer
    """
    tallies = [0] * f
    bit_masks = [1 << position for position in range(f)]

    for token, token_weight in features:
        digest = hash_func(token.encode('utf-8'))
        for position, mask in enumerate(bit_masks):
            if digest & mask:
                tallies[position] += token_weight
            else:
                tallies[position] -= token_weight

    fingerprint = 0
    for mask, tally in zip(bit_masks, tallies):
        if tally > 0:
            fingerprint |= mask
    return fingerprint
def distance(sim_hash, another, f):
    """Count the bit positions in which two simHash values differ.

    Only the lowest ``f`` bits of the XOR are considered.

    :param sim_hash: first simHash value
    :param another: second simHash value
    :param f: number of bits in the simHash
    :return: the Hamming distance between the two values
    """
    differing_bits = (sim_hash ^ another) & ((1 << f) - 1)
    # The masked value is never negative, so counting '1' characters in its
    # binary representation gives the number of set bits.
    return bin(differing_bits).count('1')
def ham_distance(chars, other_chars, weight=None, f=64):
    """Compute the Hamming distance between the simHashes of two token sequences.

    :param chars: first iterable of tokens
    :param other_chars: second iterable of tokens
    :param weight: optional weight mapping, e.g. ``weight={"电影": 3}``
    :param f: number of bits in the simHash
    :return: the Hamming distance between the two simHash values
    """
    return distance(
        handle(chars, weight, f),
        handle(other_chars, weight, f),
        f,
    )
@replace_regular(punctuation_re, '')
def simHash_similarity(text1: (str, dict), text2: (str, dict), weight: dict = None, f: int = 64) -> float:
    """Score the similarity of two texts from their simHash Hamming distance.

    NOTE(review): the ``replace_regular(punctuation_re, '')`` decorator
    presumably strips punctuation from the arguments before this body runs;
    confirm against ``jtyoui.decorators``.

    :param text1: first text
    :param text2: second text
    :param weight: optional mapping from token to weight
    :param f: number of bits in the simHash
    :return: ``1 - distance / f``, so identical simHashes give 1
    """
    differing_bits = ham_distance(text1, text2, weight=weight, f=f)
    return 1 - differing_bits / f
if __name__ == '__main__':
    # Demo: compare two pre-segmented sentences, first as token lists with a
    # custom weight, then as joined strings through the similarity helper.
    first_tokens = ['我', '吃饭', '了', '明天', '去', '看', '电影']
    second_tokens = ['我', '在', '吃饭', '了', '马上', '去', '看', '电影']

    weighted_distance = ham_distance(first_tokens, second_tokens, weight={"电影": 3})
    print(weighted_distance)

    similarity = simHash_similarity(''.join(first_tokens), ''.join(second_tokens))
    print(similarity)