def read_snli(data_dir, is_train):
"""Read the SNLI dataset into premises, hypotheses, and labels."""
def extract_text(s):
# Remove information that will not be used by us
s = re.sub('\\(', '', s)
s = re.sub('\\)', '', s)
# Substitute two or more consecutive whitespace with space
s = re.sub('\\s{2,}', ' ', s)
return s.strip()
label_set = {'entailment': 0, 'contradiction': 1, 'neutral': 2}
file_name = os.path.join(data_dir, 'snli_1.0_train.txt'
if is_train else 'snli_1.0_test.txt')
with open(file_name, 'r') as f:
rows = [row.split('\t') for row in f.readlines()[1:]]
premises = [extract_text(row[1]) for row in rows if row[0] in label_set]
hypotheses = [extract_text(row[2]) for row in rows if row[0] in label_set]
labels = [label_set[row[0]] for row in rows if row[0] in label_set]
return premises, hypotheses, labels