-
Notifications
You must be signed in to change notification settings - Fork 0
/
preprocess_dataset.py
62 lines (49 loc) · 1.64 KB
/
preprocess_dataset.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
import json
import os
from itertools import islice
from typing import Generator
def get_dataset_generator(path: str) -> Generator:
    """Lazily yield one parsed JSON value per line of a JSON Lines file.

    The file is opened when iteration starts (not when this function is
    called) and is closed once the generator is exhausted or closed.

    Args:
        path: Location of a file holding one JSON document per line.

    Yields:
        The result of ``json.loads`` for each non-blank line, in file order.

    Raises:
        FileNotFoundError: On first iteration, if ``path`` does not exist.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Decode explicitly as UTF-8: relying on the platform default encoding
    # breaks on non-ASCII content wherever that default is not UTF-8.
    with open(path, "r", encoding="utf-8") as fp:
        for line in fp:
            # Tolerate blank lines (e.g. a stray newline at end of file),
            # which json.loads would otherwise reject.
            if not line.strip():
                continue
            yield json.loads(line)
def filter_generator(g: Generator, filter_fn):
    """Lazily yield the items of ``g`` that ``filter_fn`` accepts.

    Args:
        g: Iterable to draw items from; it is consumed as the result is
            iterated.
        filter_fn: Callable invoked once per item; the item is kept when the
            return value is truthy.

    Yields:
        The accepted items, in their original order.
    """
    yield from (candidate for candidate in g if filter_fn(candidate))
def stop_after(g, num_items):
    """Lazily yield at most the first ``num_items`` items of ``g``.

    Exactly ``num_items`` items are drawn from ``g``; the element that follows
    the limit is left unconsumed, so an iterator shared with other code can be
    resumed without losing a value.

    Args:
        g: Iterable to draw items from.
        num_items: Maximum number of items to yield. Zero or a negative value
            yields nothing; ``None`` means no limit.

    Yields:
        Up to ``num_items`` leading items of ``g``, in order.
    """
    # islice rejects negative counts, so clamp them to "yield nothing".
    if num_items is not None and num_items < 0:
        num_items = 0
    yield from islice(g, num_items)
def clean_document(doc):
    """Build a trimmed copy of a raw metadata record.

    Only the five fields below are kept; any other keys of ``doc`` are
    dropped. ``doc`` itself is not modified.

    Args:
        doc: Mapping with string values under ``"title"`` and
            ``"categories"``, plus ``"id"``, ``"abstract"`` and
            ``"update_date"`` entries that are copied through unchanged.

    Returns:
        A new dict with keys ``id``, ``title`` (newlines replaced by spaces),
        ``abstract``, ``categories`` (list of whitespace-separated tokens)
        and ``update_date``.

    Raises:
        KeyError: If any of the five fields is missing from ``doc``.
    """
    return {
        "id": doc["id"],
        "title": doc["title"].replace("\n", " "),
        "abstract": doc["abstract"],
        # Whitespace split (no explicit separator) so an empty string gives []
        # and repeated spaces do not produce empty category names.
        "categories": doc["categories"].split(),
        "update_date": doc["update_date"],
    }
# Cache of already cleaned and filtered documents (one JSON object per line),
# and the raw metadata snapshot it is derived from.
CACHE_PATH = "documents.json"
SNAPSHOT_PATH = "arxiv-metadata-oai-snapshot.json"


def filter_relevant(doc):
    """Return True if any entry of ``doc["categories"]`` starts with ``cs.``."""
    return any(category.startswith("cs.") for category in doc["categories"])


documents_list = []

try:
    # Reuse the result of a previous run when the cache file exists. The
    # generator opens the file lazily, so FileNotFoundError surfaces here,
    # while list() is consuming it.
    documents_list = list(get_dataset_generator(path=CACHE_PATH))
except FileNotFoundError:
    # No cache yet: stream the snapshot, clean each record and keep only the
    # relevant ones.
    dataset_generator = get_dataset_generator(path=SNAPSHOT_PATH)
    documents = filter(filter_relevant, map(clean_document, dataset_generator))
    print("Generating in-memory documents structure")
    documents_list = list(documents)

    print(f"Writing {len(documents_list)} documents...")
    # Write to a temporary file and move it into place only once it is
    # complete; an interrupted run must not leave a truncated cache behind
    # that later runs would load as if it were the full result.
    tmp_path = CACHE_PATH + ".tmp"
    with open(tmp_path, "w", encoding="utf-8") as fp:
        fp.writelines(json.dumps(doc) + "\n" for doc in documents_list)
    os.replace(tmp_path, CACHE_PATH)

print("Document examples:")
for doc in documents_list[:3]:
    print(f"[{doc['update_date']}] {doc['title']} ({doc['categories']})")