上下位关系自动检测方法(论文复现)

概述

上下位关系自动检测方法(论文复现)-LMLPHP

算法原理
Hearst 模式

上下位关系自动检测方法(论文复现)-LMLPHP

上下位关系得分

上下位关系自动检测方法(论文复现)-LMLPHP

上下位关系自动检测方法(论文复现)-LMLPHP

上下位关系自动检测方法(论文复现)-LMLPHP

核心逻辑
import spacy
import json
from tqdm import tqdm
import re
from collections import Counter
import numpy as np
import math

nlp = spacy.load("en_core_web_sm")

def clear_text(text):
    """对文本进行清理"""
    # 这里可以添加自己的清理步骤
    # 删去交叉引用标识,例如"[1]"
    pattern = r'\[\d+\]'
    result = re.sub(pattern, '', text)
    return result

def split_sentences(text):
    """将文本划分为句子"""
    doc = nlp(text)
    sentences = [sent.text.strip() for sent in doc.sents]
    return sentences

def extract_noun_phrases(text):
    """从文本中抽取出术语"""
    doc = nlp(text)
    terms = []
    # 遍历句子中的名词性短语(例如a type of robot)
    for chunk in doc.noun_chunks:
        term_parts = []
        for token in list(chunk)[-1::]:
            # 以非名词且非形容词,或是代词的词语为界,保留右半部分(例如robot)
            if token.pos_ in ['NOUN', 'ADJ'] and token.dep_ != 'PRON':
                term_parts.append(token.text)
            else:
                break
        if term_parts != []:
            term = ' '.join(term_parts)
            terms.append(term)
    return terms

def term_lemma(term):
    """将术语中的名词还原为单数"""
    lemma = []
    doc = nlp(term)
    for token in doc:
        if token.pos_ == 'NOUN':
            lemma.append(token.lemma_)
        else:
            lemma.append(token.text)
    return ' '.join(lemma)

def find_co_occurrence(sentence, terms, patterns):
    """找出共现于模板的术语对"""
    pairs = []
    # 两两之间匹配
    for hyponym in terms:
        for hypernym in terms:
            if hyponym == hypernym:
                continue
            for pattern in patterns:
                # 将模板中的占位符替换成候选上下位词
                pattern = pattern.replace('__HYPONYM__', re.escape(hyponym))
                pattern = pattern.replace('__HYPERNYM__', re.escape(hypernym))
                # 在句子中匹配
                if re.search(pattern, sentence) != None:
                    # 将名词复数还原为单数
                    pairs.append((term_lemma(hyponym), term_lemma(hypernym)))
    return pairs

def count_unique_tuple(tuple_list):
    """统计列表中独特元组出现次数"""
    counter = Counter(tuple_list)
    result = [{"tuple": unique, "count": count} for unique, count in counter.items()]
    return result

def find_rth_largest(arr, r):
    """找到第r大的元素"""
    rth_largest_index = np.argpartition(arr, -r)[-r]
    return arr[rth_largest_index]

def find_pairs(corpus_file, patterns, disable_tqdm=False):
    """读取文件并找出共现于模板的上下位关系术语对"""
    pairs = []
    # 按行读取语料库
    lines = corpus_file.readlines()
    for line in tqdm(lines, desc="Finding pairs", ascii=" 123456789#", disable=disable_tqdm):
        # 删去首尾部分的空白字符
        line = line.strip()
        # 忽略空白行
        if line == '':
            continue
        # 清理文本
        line = clear_text(line)
        # 按句处理
        sentences = split_sentences(line)
        for sentence in sentences:
            # 抽取出句子中的名词性短语并分割成术语
            candidates_terms = extract_noun_phrases(sentence)
            # 找出共现于模板的术语对
            pairs = pairs + find_co_occurrence(sentence, candidates_terms, patterns)
    return pairs

def spmi_calculate(configs, unique_pairs):
    """基于对共现频率的统计,计算任意两个术语间的spmi得分"""
    # 计算每个术语分别作为上下位词的出现频次
    terms = list(set([pair["tuple"][0] for pair in unique_pairs] + [pair["tuple"][1] for pair in unique_pairs]))
    term_count = {term: {'hyponym_count': 0, 'hypernym_count': 0} for term in terms}
    all_count = 0
    for pair in unique_pairs:
        term_count[pair["tuple"][0]]['hyponym_count'] += pair["count"]
        term_count[pair["tuple"][1]]['hypernym_count'] += pair["count"]
        all_count += pair["count"]
    # 计算PPMI矩阵 
    ppmi_matrix = np.zeros((len(terms), len(terms)), dtype=np.float32)
    for pair in unique_pairs:
        hyponym = pair["tuple"][0]
        hyponym_id = terms.index(hyponym)
        hypernym = pair["tuple"][1]
        hypernym_id = terms.index(hypernym)
        ppmi = (pair["count"] * all_count) / (term_count[hyponym]['hyponym_count'] * term_count[hypernym]['hypernym_count'])
        ppmi = max(0, math.log(ppmi))
        ppmi_matrix[hyponym_id, hypernym_id] = ppmi
    # 对PPMI进行奇异值分解并截断
    r = configs['clip']
    U, S, Vt = np.linalg.svd(ppmi_matrix)
    S[S < find_rth_largest(S, r)] = 0
    S_r = np.diag(S)
    # 计算任意两个术语间的spmi
    paris2spmi = []
    for hyponym_id in range(len(terms)):
        for hypernym_id in range(len(terms)):
            # 同一个术语间不计算得分
            if hyponym_id == hypernym_id:
                continue
            spmi = np.dot(np.dot(U[hyponym_id , :], S_r), Vt[:, hypernym_id]).item()
            # 保留得分大于阈值的术语对
            if spmi > configs["threshold"]:
                hyponym = terms[hyponym_id]
                hypernym = terms[hypernym_id]
                paris2spmi.append({"hyponym": hyponym, "hypernym": hypernym, "spmi": spmi})
    # 按spmi从大到小排序
    paris2spmi = sorted(paris2spmi, key=lambda x: x["spmi"], reverse=True)
    return paris2spmi

if __name__ == "__main__":
    # 读取配置文件
    with open('config.json', 'r') as config_file:
        configs = json.load(config_file)
    # 读取模板
    with open(configs['patterns_path'], 'r') as patterns_file:
        patterns = json.load(patterns_file)
    # 语料库中共现于模板的术语对
    with open(configs['corpus_path'], 'r', encoding='utf-8') as corpus_file:
        pairs = find_pairs(corpus_file, patterns)
    # 统计上下位关系的出现频次
    unique_pairs = count_unique_tuple(pairs)
    with open(configs["pairs_path"], 'w') as pairs_file:
        json.dump(unique_pairs, pairs_file, indent=6, ensure_ascii=True)
    # 计算任意两个术语间的spmi得分
    paris2spmi = spmi_calculate(configs, unique_pairs)
    with open(configs['spmi_path'], 'w') as spmi_file:
        json.dump(paris2spmi, spmi_file, indent=6, ensure_ascii=True)
效果演示

上下位关系自动检测方法(论文复现)-LMLPHP

使用方式
unzip Revisit-Hearst-Pattern.zip
cd Revisit-Hearst-Pattern
pip install -r requirements.txt
python -m spacy download en_core_web_sm
python main.py
python main-flask.py

上下位关系自动检测方法(论文复现)-LMLPHP

10-01 07:16