Take translation data as an example: in many datasets the source sentences and their translations live in two separate files but correspond to each other line by line. One convenient approach is to first use pandas to produce a CSV or JSON file and then hand that file to torchtext. The implementation looks like this:

Data format:

The English and German sentences are stored in two separate files, one sentence per line.

import pandas as pd
from sklearn.model_selection import train_test_split
import spacy
import torch
from torchtext.data import Field, TabularDataset, BucketIterator
# Note: on torchtext 0.9-0.11 these classes moved to torchtext.legacy.data,
# and they were removed in 0.12; this example assumes an older release.

english_txt = open('data/test_WMT_english_newstest2015.txt', encoding='utf-8').read().split('\n')
german_txt = open('data/test_WMT_german_newstest2015.txt', encoding='utf-8').read().split('\n')

print(english_txt[:3])
print(german_txt[:3])
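Because the two files are assumed to be aligned line by line, it is worth checking that they really contain the same number of sentences. A small sketch (read().split('\n') usually leaves a trailing empty string when the file ends with a newline):

# Drop a possible trailing empty line, then verify the files are aligned
if english_txt and english_txt[-1] == '':
    english_txt = english_txt[:-1]
if german_txt and german_txt[-1] == '':
    german_txt = german_txt[:-1]
assert len(english_txt) == len(german_txt), 'source and target files are not aligned'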


# Take roughly the first 1,000 sentence pairs to keep the example small
raw_data = {'English': english_txt[1:1000],
            'German': german_txt[1:1000]}

df = pd.DataFrame(raw_data, columns=['English', 'German'])

train, test = train_test_split(df, test_size=0.2)

train.to_json('data/train.json', orient='records', lines=True)
test.to_json('data/test.json', orient='records', lines=True)

train.to_csv('data/train.csv', index=False)
test.to_csv('data/test.csv', index=False)
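With orient='records', lines=True, each DataFrame row is written as one JSON object per line, which is the layout TabularDataset expects for format='json'. A quick sanity check of the written file (a sketch):

import json

with open('data/train.json', encoding='utf-8') as f:
    first_record = json.loads(f.readline())
print(first_record.keys())   # dict_keys(['English', 'German'])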

spacy_eng = spacy.load('en')
spacy_ger = spacy.load('de')
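On spaCy 3.x the shortcut names 'en' and 'de' are no longer available; if the two loads above fail, the equivalent calls are the ones below (after downloading the models with python -m spacy download en_core_web_sm and python -m spacy download de_core_news_sm):

spacy_eng = spacy.load('en_core_web_sm')
spacy_ger = spacy.load('de_core_news_sm')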


def tokenizer_eng(text):
    return [tok.text for tok in spacy_eng.tokenizer(text)]

def tokenizer_ger(text):
    return [tok.text for tok in spacy_ger.tokenizer(text)]
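As a quick check, each tokenizer simply splits a string into a list of token strings; lowercasing is handled later by the Field's lower=True. For example:

print(tokenizer_eng('A quick sanity check!'))   # e.g. ['A', 'quick', 'sanity', 'check', '!']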

ENGLISH = Field(sequential=True, use_vocab=True, tokenize=tokenizer_eng, lower=True, batch_first=True)
GERMAN = Field(sequential=True, use_vocab=True, tokenize=tokenizer_ger, lower=True, batch_first=True)
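For a sequence-to-sequence translation model you would typically also want start- and end-of-sentence markers on the target side; Field supports this through init_token and eos_token. A sketch that is not used in the rest of this example (the name GERMAN_SEQ2SEQ is just for illustration):

GERMAN_SEQ2SEQ = Field(sequential=True, use_vocab=True, tokenize=tokenizer_ger,
                       lower=True, batch_first=True,
                       init_token='<sos>', eos_token='<eos>')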

# Keys are the JSON keys (or CSV column names); each maps to
# (attribute name on the batch, Field used to process that column)
fields = {'English': ('eng', ENGLISH), 'German': ('ger', GERMAN)}

train_data, test_data = TabularDataset.splits(path='data',
                                              train='train.json',
                                              test='test.json',
                                              format='json',
                                              fields=fields)
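The CSV files written earlier can be loaded the same way: with a fields dict the keys refer to the CSV column names, so only the format argument changes. A sketch reusing train.csv/test.csv from above (the *_csv variable names are just for illustration):

train_data_csv, test_data_csv = TabularDataset.splits(path='data',
                                                      train='train.csv',
                                                      test='test.csv',
                                                      format='csv',
                                                      fields=fields)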

ENGLISH.build_vocab(train_data, max_size=10000, min_freq=2)
GERMAN.build_vocab(train_data, max_size=10000, min_freq=2)
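After build_vocab, each Field holds a Vocab object with stoi/itos lookups and token frequency counts, which is handy for a quick inspection (a sketch):

print('English vocab size:', len(ENGLISH.vocab))
print('Most common English tokens:', ENGLISH.vocab.freqs.most_common(5))
print('First entries of itos:', ENGLISH.vocab.itos[:10])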


print('English padding token index: ', ENGLISH.vocab.stoi[ENGLISH.pad_token])
print('English unknown token index: ', ENGLISH.vocab.stoi[ENGLISH.unk_token])
print('German padding token index: ', GERMAN.vocab.stoi[GERMAN.pad_token])
print('German unknown token index: ', GERMAN.vocab.stoi[GERMAN.unk_token])
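With the default Field settings ('<unk>' and '<pad>' as the special tokens), torchtext normally assigns index 0 to the unknown token and index 1 to the padding token, so these four prints should usually show 1, 0, 1, 0.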


# BucketIterator groups sentences of similar length into the same batch
# (sort_within_batch with sort_key) to minimise the amount of padding
train_iterator, test_iterator = BucketIterator.splits((train_data, test_data),
                                                      batch_size=32,
                                                      device=torch.device('cuda' if torch.cuda.is_available() else 'cpu'),
                                                      sort_key=lambda x: len(x.eng),
                                                      sort=False,
                                                      sort_within_batch=True)

for batch in train_iterator:
    # with batch_first=True each tensor has shape [batch_size, seq_len]
    print(batch.eng)
    print(batch.ger)
    break
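To see actual sentences rather than index tensors, each row of the batch can be mapped back through itos. A sketch that decodes the first English sentence of the batch left over from the loop above:

first_example = batch.eng[0]                       # shape [seq_len] because batch_first=True
tokens = [ENGLISH.vocab.itos[idx.item()] for idx in first_example]
print(' '.join(tokens))                            # padding appears as '<pad>' at the end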