上下位关系自动检测方法(论文复现)
概述
算法原理
Hearst 模式
上下位关系得分
核心逻辑
import spacy
import json
from tqdm import tqdm
import re
from collections import Counter
import numpy as np
import math
# Load the `en_core_web_sm` spaCy pipeline once at import time; split_sentences,
# extract_noun_phrases and term_lemma all reuse this shared `nlp` object.
nlp = spacy.load("en_core_web_sm")
def clear_text(text):
    """Strip numeric cross-reference markers such as "[1]" from ``text``.

    Further corpus-specific clean-up steps can be added here.
    """
    # A marker is one or more digits wrapped in square brackets.
    return re.sub(r'\[\d+\]', '', text)
def split_sentences(text):
    """Split ``text`` into sentences with the shared spaCy pipeline.

    Returns a list holding the whitespace-stripped text of each sentence.
    """
    sentences = []
    for sent in nlp(text).sents:
        sentences.append(sent.text.strip())
    return sentences
def extract_noun_phrases(text):
    """Extract candidate terms from the noun chunks of ``text``.

    Each noun chunk is scanned from its last token towards its first. Tokens
    are collected while they are tagged as nouns or adjectives; the first
    token with any other part of speech (determiner, pronoun, preposition,
    ...) acts as the boundary and ends the scan. The collected tokens are put
    back into reading order and joined with single spaces, so only the
    right-hand part of the chunk is kept (e.g. "a mobile robot" is expected to
    give "mobile robot", assuming "mobile" is tagged ADJ).

    Returns:
        A list of term strings, in chunk order. Chunks whose last token is
        neither a noun nor an adjective contribute nothing.
    """
    doc = nlp(text)
    terms = []
    for chunk in doc.noun_chunks:
        term_parts = []
        # Iterate in reverse: a `[-1:]`-style slice would visit only the final
        # token, so the boundary scan would never see the modifiers before it.
        for token in reversed(list(chunk)):
            # Restricting pos_ to NOUN/ADJ already excludes pronouns.
            if token.pos_ not in ('NOUN', 'ADJ'):
                break
            term_parts.append(token.text)
        if term_parts:
            # Tokens were gathered right-to-left; restore reading order.
            term_parts.reverse()
            terms.append(' '.join(term_parts))
    return terms
def term_lemma(term):
    """Return ``term`` with every noun replaced by its lemma (singular form).

    Tokens that are not tagged as nouns keep their original text. The tokens
    are re-joined with single spaces.
    """
    return ' '.join(
        token.lemma_ if token.pos_ == 'NOUN' else token.text
        for token in nlp(term)
    )
def find_co_occurrence(sentence, terms, patterns):
    """Collect the (hyponym, hypernym) pairs that a pattern links in ``sentence``.

    Every ordered pair of distinct terms is substituted into every pattern
    template (placeholders ``__HYPONYM__`` and ``__HYPERNYM__``), and the
    resulting regular expression is searched in the sentence. One lemmatised
    pair is emitted per matching pattern, so the same pair can appear more
    than once in the returned list.
    """
    matched = []
    for hyponym in terms:
        for hypernym in terms:
            # A term cannot be its own hypernym.
            if hyponym == hypernym:
                continue
            for template in patterns:
                # Escape the terms so they are matched literally inside the
                # regular expression built from the template.
                regex = template.replace('__HYPONYM__', re.escape(hyponym))
                regex = regex.replace('__HYPERNYM__', re.escape(hypernym))
                if re.search(regex, sentence) is None:
                    continue
                # Reduce plural nouns to their singular form before recording.
                matched.append((term_lemma(hyponym), term_lemma(hypernym)))
    return matched
def count_unique_tuple(tuple_list):
    """Count how often each distinct tuple occurs in ``tuple_list``.

    Returns a list of ``{"tuple": ..., "count": ...}`` records, one per
    distinct tuple, in order of first appearance.
    """
    records = []
    for unique, count in Counter(tuple_list).items():
        records.append({"tuple": unique, "count": count})
    return records
def find_rth_largest(arr, r):
    """Return the ``r``-th largest element of ``arr`` (``r`` = 1 is the maximum)."""
    # argpartition places the index of the r-th largest value at position -r
    # without fully sorting the array.
    partition_order = np.argpartition(arr, -r)
    return arr[partition_order[-r]]
def find_pairs(corpus_file, patterns, disable_tqdm=False):
    """Read a corpus and collect the term pairs that co-occur in a pattern.

    Args:
        corpus_file: open text file (anything providing ``readlines()``); each
            line is processed independently.
        patterns: pattern templates forwarded to ``find_co_occurrence``.
        disable_tqdm: when true, the progress bar is suppressed.

    Returns:
        A list of (hyponym, hypernym) tuples, one per pattern match, in corpus
        order. Duplicates are kept so that they can be counted afterwards.
    """
    pairs = []
    # Read every line up front so the progress bar knows the total.
    lines = corpus_file.readlines()
    for line in tqdm(lines, desc="Finding pairs", ascii=" 123456789#", disable=disable_tqdm):
        # Drop surrounding whitespace and skip blank lines.
        line = line.strip()
        if not line:
            continue
        # Clean the text, then handle it sentence by sentence.
        for sentence in split_sentences(clear_text(line)):
            # Candidate terms come from the sentence's noun phrases.
            candidates_terms = extract_noun_phrases(sentence)
            # extend() appends in place; rebuilding the list with `+` would
            # copy every pair found so far for each sentence.
            pairs.extend(find_co_occurrence(sentence, candidates_terms, patterns))
    return pairs
def spmi_calculate(configs, unique_pairs):
    """Compute SPMI scores between terms from pattern co-occurrence counts.

    A PPMI matrix (rows: hyponyms, columns: hypernyms) is built from the
    counts, decomposed with SVD, truncated to the largest singular values and
    multiplied back together; the entries of that low-rank reconstruction are
    the SPMI scores.

    Args:
        configs: mapping with ``"clip"`` (how many of the largest singular
            values to keep; values above the matrix size keep all of them and
            non-positive values disable truncation) and ``"threshold"`` (only
            scores strictly greater than it are returned).
        unique_pairs: records of the form
            ``{"tuple": (hyponym, hypernym), "count": n}``.

    Returns:
        A list of ``{"hyponym": ..., "hypernym": ..., "spmi": ...}`` dicts for
        every ordered pair of distinct terms whose score exceeds the
        threshold, sorted by score in descending order. An empty input yields
        an empty list.
    """
    # Nothing to score; SVD of an empty matrix is not meaningful.
    if not unique_pairs:
        return []

    # Sort the vocabulary so row/column order (and therefore the order of
    # equal-scored results) does not depend on set iteration order.
    terms = sorted(
        {pair["tuple"][0] for pair in unique_pairs}
        | {pair["tuple"][1] for pair in unique_pairs}
    )
    term_ids = {term: idx for idx, term in enumerate(terms)}

    # How often each term occurs as a hyponym / as a hypernym, and in total.
    hyponym_count = dict.fromkeys(terms, 0)
    hypernym_count = dict.fromkeys(terms, 0)
    all_count = 0
    for pair in unique_pairs:
        hyponym_count[pair["tuple"][0]] += pair["count"]
        hypernym_count[pair["tuple"][1]] += pair["count"]
        all_count += pair["count"]

    # Build the PPMI matrix: max(0, log(p(x, y) / (p(x) * p(y)))).
    ppmi_matrix = np.zeros((len(terms), len(terms)), dtype=np.float32)
    for pair in unique_pairs:
        hyponym = pair["tuple"][0]
        hypernym = pair["tuple"][1]
        ratio = (pair["count"] * all_count) / (hyponym_count[hyponym] * hypernym_count[hypernym])
        ppmi_matrix[term_ids[hyponym], term_ids[hypernym]] = max(0, math.log(ratio))

    # Decompose and truncate. numpy returns the singular values in descending
    # order, so S[rank - 1] is the rank-th largest one; values equal to it
    # are kept.
    U, S, Vt = np.linalg.svd(ppmi_matrix)
    rank = min(configs['clip'], len(S))
    if rank >= 1:
        S[S < S[rank - 1]] = 0

    # Low-rank reconstruction of the whole matrix in one product; entry
    # (i, j) equals U[i, :] @ diag(S) @ Vt[:, j].
    scores = ((U * S) @ Vt).astype(np.float64)

    # Keep the pairs of distinct terms whose score exceeds the threshold.
    pairs2spmi = []
    for hyponym_id, hypernym_id in np.argwhere(scores > configs["threshold"]):
        if hyponym_id == hypernym_id:
            continue
        pairs2spmi.append({
            "hyponym": terms[hyponym_id],
            "hypernym": terms[hypernym_id],
            "spmi": float(scores[hyponym_id, hypernym_id]),
        })

    # Highest score first.
    pairs2spmi.sort(key=lambda x: x["spmi"], reverse=True)
    return pairs2spmi
if __name__ == "__main__":
    # All files are opened as UTF-8 explicitly so the script does not depend
    # on the platform's default encoding.

    # Load the configuration file.
    with open('config.json', 'r', encoding='utf-8') as config_file:
        configs = json.load(config_file)

    # Load the pattern templates.
    with open(configs['patterns_path'], 'r', encoding='utf-8') as patterns_file:
        patterns = json.load(patterns_file)

    # Find the term pairs that co-occur in a pattern somewhere in the corpus.
    with open(configs['corpus_path'], 'r', encoding='utf-8') as corpus_file:
        pairs = find_pairs(corpus_file, patterns)

    # Count how often each hyponym/hypernym pair occurred and save the counts.
    unique_pairs = count_unique_tuple(pairs)
    with open(configs["pairs_path"], 'w', encoding='utf-8') as pairs_file:
        json.dump(unique_pairs, pairs_file, indent=6, ensure_ascii=True)

    # Compute the SPMI score between terms and save the ranked result.
    paris2spmi = spmi_calculate(configs, unique_pairs)
    with open(configs['spmi_path'], 'w', encoding='utf-8') as spmi_file:
        json.dump(paris2spmi, spmi_file, indent=6, ensure_ascii=True)
效果演示
使用方式
unzip Revisit-Hearst-Pattern.zip
cd Revisit-Hearst-Pattern
pip install -r requirements.txt
python -m spacy download en_core_web_sm
python main.py
python main-flask.py