Add PubMed IDs #227

Merged
merged 27 commits on Apr 22, 2024
Changes from all commits (27 commits):
c3b9509
Started adding support for recursive download.
gaurav Jan 23, 2024
5f09c14
Added a basic recursion option for Wget.
gaurav Jan 30, 2024
32224fd
First stab at publication download.
gaurav Jan 30, 2024
1976a64
Working PubMed download code, in-progress parsing code.
gaurav Jan 30, 2024
6578eda
Apparently working title extraction.
gaurav Jan 30, 2024
c3285c0
Fixed parsing code.
gaurav Jan 30, 2024
533955e
First stab at generating a Publication compendium.
gaurav Feb 19, 2024
b048d9c
Added code to create compendia and synonym directories if missing.
gaurav Feb 23, 2024
222b809
Improved inputs.
gaurav Mar 26, 2024
c06c14e
Noted outputs as directories.
gaurav Mar 26, 2024
88bc98e
Oops, directory() flag is only for outputs.
gaurav Mar 26, 2024
15482a7
Added publication_outputs to reports.
gaurav Mar 28, 2024
0b200e3
First stab at adding titles.
gaurav Mar 28, 2024
a0c93f4
Fixed path to PubMed titles.
gaurav Mar 28, 2024
ba37d54
Added updatefiles, fixed PubStatuses.
gaurav Apr 4, 2024
e86393a
Improved indentation and warnings.
gaurav Apr 4, 2024
433f78d
Workaround for lack of XPath support.
gaurav Apr 4, 2024
103bfad
Updated PubStatus to record all statuses.
gaurav Apr 4, 2024
6932885
Added an option to track PubStatuses as we go.
gaurav Apr 4, 2024
a7349f4
Add PMCIDs.
gaurav Apr 4, 2024
ed340fa
Oops, it's PMC, not PMCID.
gaurav Apr 4, 2024
3ed1b5f
Fixed bug in PMC output -- missing newline.
gaurav Apr 5, 2024
71f596a
Write out the status file as a JSONL gzipped file.
gaurav Apr 5, 2024
7c61e4a
Write the titles file as titles.tsv to make it easier to use.
gaurav Apr 5, 2024
c259c69
Improved documentation.
gaurav Apr 15, 2024
17f9c3a
Added Publication assessments and added to Snakefile.
gaurav Apr 15, 2024
680c01b
Added Publication.txt to expected synonyms.
gaurav Apr 15, 2024
2 changes: 2 additions & 0 deletions Snakefile
@@ -13,6 +13,7 @@ include: "src/snakefiles/taxon.snakefile"
include: "src/snakefiles/genefamily.snakefile"
include: "src/snakefiles/leftover_umls.snakefile"
include: "src/snakefiles/macromolecular_complex.snakefile"
include: "src/snakefiles/publications.snakefile"

include: "src/snakefiles/reports.snakefile"
include: "src/snakefiles/exports.snakefile"
@@ -46,6 +47,7 @@ rule all_outputs:
        config['output_directory'] + '/reports/umls_done',
        config['output_directory'] + '/reports/macromolecular_complex_done',
        config['output_directory'] + '/reports/drugchemical_done',
        config['output_directory'] + '/reports/publications_done',
    output:
        x = config['output_directory'] + '/reports/outputs_done'
    shell:
2 changes: 2 additions & 0 deletions config.json
@@ -65,6 +65,8 @@
"http://www.informatics.jax.org/marker/MGI:": "MGI"
},

"publication_outputs": ["Publication.txt"],

"geneprotein_outputs": ["GeneProtein.txt"],
"drugchemical_outputs": ["DrugChemical.txt"],

61 changes: 53 additions & 8 deletions src/babel_utils.py
@@ -1,6 +1,7 @@
import logging
import subprocess
import traceback
from enum import Enum
from ftplib import FTP
from io import BytesIO
import gzip
@@ -207,30 +208,47 @@ def pull_via_urllib(url: str, in_file_name: str, decompress = True, subpath=None
    return out_file_name


# Recursion options for pull_via_wget().
# See https://www.gnu.org/software/wget/manual/html_node/Recursive-Download.html for wget's recursion options.
class WgetRecursionOptions(Enum):
    NO_RECURSION = 0            # Don't do any recursion
    RECURSE_SUBFOLDERS = 1      # Recurse into subfolders -- equivalent to `--recursive --no-parent --no-directories`
    RECURSE_DIRECTORY_ONLY = 2  # Recurse through a single directory only -- equivalent to `--recursive --no-parent --no-directories --level=1`


def pull_via_wget(
        url_prefix: str,
        in_file_name: str,
        decompress=True,
        subpath:str=None,
        outpath:str=None,
        continue_incomplete:bool=True,
        recurse:WgetRecursionOptions = WgetRecursionOptions.NO_RECURSION,
        retries:int=10):
    """
    Download a file using wget. We call wget from the command line, and use command line options to
    request continuing incomplete downloads.

    :param url_prefix: The URL prefix to download.
    :param in_file_name: The filename to download -- this will be concatenated to the URL prefix. This should include
        the compression extension (e.g. `.gz`); we will remove that extension during decompression. If recursion is
        turned on, in_file_name refers to the directory where the recursive content will be downloaded.
    :param decompress: Whether this is a Gzip file that should be decompressed after download.
    :param subpath: The subdirectory of `babel_download` where this file should be stored.
    :param outpath: The full output directory to write this file to. subpath and outpath cannot both be set at the same time.
    :param continue_incomplete: Should wget continue an incomplete download?
    :param recurse: Do we want to download recursively? Should be a WgetRecursionOptions value, such as WgetRecursionOptions.NO_RECURSION.
    :param retries: The number of retries to attempt.
    """

    # Prepare download URL and location
    download_dir = get_config()['download_directory']
    url = url_prefix + in_file_name
    if subpath and outpath:
        raise RuntimeError("pull_via_wget() cannot be called with both subpath and outpath set.")
    elif outpath:
        dl_file_name = outpath
    elif subpath:
        dl_file_name = os.path.join(download_dir, subpath, in_file_name)
    else:
        dl_file_name = os.path.join(download_dir, in_file_name)
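For orientation, here is a minimal sketch of how the new recurse parameter is intended to be called; it mirrors the PubMed baseline download added in src/createcompendia/publications.py below, so the URL and subpath come from that file:

from src.babel_utils import pull_via_wget, WgetRecursionOptions

# Mirror ftp://ftp.ncbi.nlm.nih.gov/pubmed/baseline into <download_directory>/PubMed/baseline.
# With recursion enabled, 'baseline' names the directory to fill rather than a single file to write.
pull_via_wget(
    'ftp://ftp.ncbi.nlm.nih.gov/pubmed/', 'baseline',
    decompress=False,
    subpath='PubMed',
    recurse=WgetRecursionOptions.RECURSE_SUBFOLDERS)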
@@ -247,7 +265,18 @@ def pull_via_wget(

    # Add URL and output file.
    wget_command_line.append(url)

    # Handle recursion options
    match recurse:
        case WgetRecursionOptions.NO_RECURSION:
            # Write to a single file, dl_file_name
            wget_command_line.extend(['-O', dl_file_name])
        case WgetRecursionOptions.RECURSE_SUBFOLDERS:
            # dl_file_name should be a directory name.
            wget_command_line.extend(['--recursive', '--no-parent', '--no-directories', '--directory-prefix=' + dl_file_name])
        case WgetRecursionOptions.RECURSE_DIRECTORY_ONLY:
            # dl_file_name should be a directory name.
            wget_command_line.extend(['--recursive', '--no-parent', '--no-directories', '--level=1', '--directory-prefix=' + dl_file_name])

    # Execute wget.
    logging.info(f"Downloading {dl_file_name} using wget: {wget_command_line}")
@@ -266,12 +295,21 @@ def pull_via_wget(
        else:
            raise RuntimeError(f"Don't know how to decompress {in_file_name}")

        if os.path.isfile(uncompressed_filename):
            file_size = os.path.getsize(uncompressed_filename)
            if file_size > 0:
                logging.info(f"Downloaded {uncompressed_filename} from {url}, file size {file_size} bytes.")
            else:
                raise RuntimeError(f'Expected uncompressed file {uncompressed_filename} does not exist.')
        else:
            raise RuntimeError(f'Expected uncompressed file {uncompressed_filename} does not exist.')

    if os.path.isfile(dl_file_name):
        file_size = os.path.getsize(dl_file_name)
        logging.info(f"Downloaded {dl_file_name} from {url}, file size {file_size} bytes.")
    elif os.path.isdir(dl_file_name):
        # Count the number of files in directory dl_file_name
        dir_size = sum(os.path.getsize(os.path.join(dl_file_name, f)) for f in os.listdir(dl_file_name) if os.path.isfile(os.path.join(dl_file_name, f)))
        logging.info(f"Downloaded {dir_size} files from {url} to {dl_file_name}.")
    else:
        raise RuntimeError(f'Unknown file type {dl_file_name}')


def sort_identifiers_with_boosted_prefixes(identifiers, prefixes):
@@ -316,7 +354,8 @@ def write_compendium(synonym_list,ofname,node_type,labels={},extra_prefixes=[],i
    :param synonym_list:
    :param ofname:
    :param node_type:
    :param labels: A map of identifiers to labels.
        Not needed if each identifier will have a label in the correct directory (i.e. downloads/PMID/labels for PMID:xxx).
    :param extra_prefixes: We default to only allowing the prefixes allowed for a particular type in Biolink.
        If you want to allow additional prefixes, list them here.
    :param icrdf_filename: (REQUIRED) The file to read the information content from (icRDF.tsv). Although this is a
@@ -345,6 +384,12 @@ def write_compendium(synonym_list,ofname,node_type,labels={},extra_prefixes=[],i
    description_factory = DescriptionFactory(make_local_name(''))
    taxon_factory = TaxonFactory(make_local_name(''))
    node_test = node_factory.create_node(input_identifiers=[], node_type=node_type, labels={}, extra_prefixes=extra_prefixes)

    # Create compendia and synonyms directories, just in case they haven't been created yet.
    os.makedirs(os.path.join(cdir, 'compendia'), exist_ok=True)
    os.makedirs(os.path.join(cdir, 'synonyms'), exist_ok=True)

    # Write compendium and synonym files.
    with jsonlines.open(os.path.join(cdir, 'compendia', ofname), 'w') as outf, jsonlines.open(os.path.join(cdir, 'synonyms', ofname), 'w') as sfile:
        for slist in synonym_list:
            node = node_factory.create_node(input_identifiers=slist, node_type=node_type, labels=labels, extra_prefixes=extra_prefixes)
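To make the labels parameter concrete for the publications case: generate_compendium() in src/createcompendia/publications.py (below) builds it as a plain dict from CURIE to article title, along these lines. The identifier and title here are invented for illustration.

# Illustrative labels mapping as built by generate_compendium() below
# (the PMID and title are made up for the example):
labels = {'PMID:00000000': 'An example article title'}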
1 change: 1 addition & 0 deletions src/categories.py
@@ -29,4 +29,5 @@
PROCEDURE = 'biolink:Procedure'
PROTEIN = 'biolink:Protein'
PUBLICATION = 'biolink:Publication'
JOURNAL_ARTICLE = 'biolink:JournalArticle'
SMALL_MOLECULE = 'biolink:SmallMolecule'
210 changes: 210 additions & 0 deletions src/createcompendia/publications.py
@@ -0,0 +1,210 @@
import gzip
import json
import logging
import os
import time
from collections import defaultdict
from pathlib import Path
import xml.etree.ElementTree as ET

from src.babel_utils import pull_via_wget, WgetRecursionOptions, glom, read_identifier_file, write_compendium
from src.categories import JOURNAL_ARTICLE, PUBLICATION
from src.prefixes import PMID, DOI, PMC


def download_pubmed(download_file,
                    pubmed_base='ftp://ftp.ncbi.nlm.nih.gov/pubmed/',
                    pmc_base='https://ftp.ncbi.nlm.nih.gov/pub/pmc/'):
    """
    Download PubMed. We download both the PubMed annual baseline and the daily update files,
    which are in the same format, but the baseline is set up at the start of the year and then
    updates are included in the daily update files.

    :param download_file: A `done` file that should be created to indicate that we are done.
    :param pubmed_base: The PubMed base URL to download files from.
    :param pmc_base: The PMC base URL to download the PMC-ids.csv.gz mapping file from.
    """

    # Create directories if they don't exist.
    os.makedirs(os.path.dirname(download_file), exist_ok=True)

    # Step 1. Download all the files for the PubMed annual baseline.
    pull_via_wget(
        pubmed_base, 'baseline',
        decompress=False,
        subpath='PubMed',
        recurse=WgetRecursionOptions.RECURSE_SUBFOLDERS)

    # Step 2. Download all the files for the PubMed update files.
    pull_via_wget(
        pubmed_base, 'updatefiles',
        decompress=False,
        subpath='PubMed',
        recurse=WgetRecursionOptions.RECURSE_SUBFOLDERS)

    # Step 3. Download the PMC/PMID mapping file from PMC.
    pull_via_wget(pmc_base, 'PMC-ids.csv.gz', decompress=True, subpath='PubMed')

    # We're all done!
    Path.touch(download_file)


def parse_pubmed_into_tsvs(baseline_dir, updatefiles_dir, titles_file, status_file, pmid_id_file,
                           pmid_doi_concord_file):
    """
    Reads through the PubMed files in the baseline_dir and updatefiles_dir, and writes out label and status information.

    :param baseline_dir: The PubMed baseline directory to parse.
    :param updatefiles_dir: The PubMed updatefiles directory to parse.
    :param titles_file: An output TSV file in the format `<PMID>\t<TITLE>`.
    :param status_file: A gzipped JSONL file containing publication status information.
    :param pmid_id_file: An output TSV file listing each PMID and its Biolink type.
    :param pmid_doi_concord_file: A concord file in the format `<PMID>\teq\t<DOI>` and other identifiers.
    """

    # We can write labels and concords as we go.
    with open(titles_file, 'w') as titlesf, open(pmid_id_file, 'w') as pmidf, open(pmid_doi_concord_file, 'w') as concordf:
        # Track PubMed article statuses. In theory the final PubMed entry should have all the dates, which should
        # tell us the final status of a publication, but really we just want to know if the article has ever been
        # marked as retracted, so instead we track every status that has ever been attached to any article. We
        # don't have a way of tracking properties yet (https://github.com/TranslatorSRI/Babel/issues/155), so for now
        # we write this out in JSON to the status_file.
        pmid_status = defaultdict(set)

        # Read every file in the baseline and updatefiles directories (they have the same format).
        baseline_filenames = list(map(lambda fn: os.path.join(baseline_dir, fn), sorted(os.listdir(baseline_dir))))
        updatefiles_filenames = list(
            map(lambda fn: os.path.join(updatefiles_dir, fn), sorted(os.listdir(updatefiles_dir))))

        for pubmed_filename in (baseline_filenames + updatefiles_filenames):
            if not pubmed_filename.endswith(".xml.gz"):
                logging.warning(f"Skipping non-gzipped-XML file {pubmed_filename} in PubMed files.")
                continue

            with gzip.open(pubmed_filename, 'rt') as pubmedf:
                logging.info(f"Parsing PubMed Baseline {pubmed_filename}")

                start_time = time.time_ns()
                count_articles = 0
                count_pmids = 0
                count_dois = 0
                count_pmcs = 0
                count_titles = 0
                file_pubstatuses = set()

                # Read every XML entry from every PubMed file.
                parser = ET.XMLPullParser(['end'])
                for line in pubmedf:
                    parser.feed(line)
                    for event, elem in parser.read_events():
                        if event == 'end' and elem.tag == 'PubmedArticle':
                            count_articles += 1

                            # Look for the pieces of information we want.
                            pmids = elem.findall("./PubmedData/ArticleIdList/ArticleId[@IdType='pubmed']")
                            dois = elem.findall("./PubmedData/ArticleIdList/ArticleId[@IdType='doi']")
                            pmcs = elem.findall("./PubmedData/ArticleIdList/ArticleId[@IdType='pmc']")
                            titles = elem.findall('.//ArticleTitle')

                            # Retrieve the PubDates containing PubStatuses.
                            pubdates_with_pubstatus = elem.findall("./PubmedData/History/PubMedPubDate[@PubStatus]")
                            pubstatuses = set()
                            for pubdate in pubdates_with_pubstatus:
                                # We ignore the dates, and instead record all the PubStatuses that a PMID has ever had.
                                if pubdate.get('PubStatus'):
                                    pubstatuses.add(pubdate.get('PubStatus'))

                            # Write information for each PMID.
                            for pmid in pmids:
                                count_pmids += 1

                                # Write out PMID type.
                                pmidf.write(f"{PMID}:{pmid.text}\t{JOURNAL_ARTICLE}\n")

                                # Update PMID status.
                                pmid_status[f'{PMID}:' + pmid.text].update(pubstatuses)
                                file_pubstatuses.update(pubstatuses)

                            # Write out the titles.
                            for title in titles:
                                count_titles += 1
                                # Convert newlines into '\n'.
                                title_text = title.text
                                if not title_text:
                                    continue
                                title_text = title_text.replace('\n', '\\n')

                                titlesf.write(f"{PMID}:{pmid.text}\t{title_text}\n")

                            # Write out the DOIs to the concords file.
                            for doi in dois:
                                count_dois += 1
                                concordf.write(f"{PMID}:{pmid.text}\teq\t{DOI}:{doi.text}\n")

                            # Write out the PMCIDs to the concords file.
                            for pmc in pmcs:
                                count_pmcs += 1
                                concordf.write(f"{PMID}:{pmid.text}\teq\t{PMC}:{pmc.text}\n")

                time_taken_in_seconds = float(time.time_ns() - start_time) / 1_000_000_000
                logging.info(
                    f"Parsed {count_articles} articles from PubMed {pubmed_filename} in " +
                    f"{time_taken_in_seconds:.4f} seconds: {count_pmids} PMIDs, {count_dois} DOIs, " +
                    f"{count_pmcs} PMCs, " +
                    f"{count_titles} titles with the following PubStatuses: {sorted(file_pubstatuses)}.")

    # Write the statuses into a gzipped JSONL file.
    with gzip.open(status_file, 'wt') as statusf:
        # This will be more readable as a JSONL file, so let's write it out that way.
        for pmid, statuses in pmid_status.items():
            statusf.write(json.dumps({'id': pmid, 'statuses': sorted(statuses)}, sort_keys=True) + '\n')
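Since the motivation above is mainly to spot articles that have ever been marked as retracted, here is a hedged sketch of how the resulting gzipped JSONL status file could be scanned downstream; the path is illustrative, and 'retracted' is assumed to appear as a PubStatus value.

import gzip
import json

# Sketch: list PMIDs whose recorded PubStatuses ever included 'retracted'.
with gzip.open('babel_downloads/PubMed/statuses.jsonl.gz', 'rt') as statusf:  # illustrative path
    for line in statusf:
        record = json.loads(line)
        if 'retracted' in record['statuses']:
            print(record['id'])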


def generate_compendium(concordances, identifiers, titles, publication_compendium, icrdf_filename):
    """
    Generate a Publication compendium using the ID and Concord files provided.

    :param concordances: A list of concordances to use.
    :param identifiers: A list of identifiers to use.
    :param titles: A list of title TSV files (PMID to title) to use as labels.
    :param publication_compendium: The publication compendium file to produce.
    :param icrdf_filename: The ICRDF file.
    """

    dicts = {}
    types = {}
    uniques = [PMID]

    # Load PMID identifiers.
    for ifile in identifiers:
        print('loading', ifile)
        new_identifiers, new_types = read_identifier_file(ifile)
        glom(dicts, new_identifiers, unique_prefixes=uniques)
        types.update(new_types)

    # Load concordances.
    for infile in concordances:
        print(infile)
        print('loading', infile)
        pairs = []
        with open(infile, 'r') as inf:
            for line in inf:
                x = line.strip().split('\t')
                pairs.append({x[0], x[2]})
        glom(dicts, pairs, unique_prefixes=uniques)

    # Publications have titles, not labels. We load them here.
    labels = dict()
    for title_filename in titles:
        print('loading titles from', title_filename)
        with open(title_filename, 'r') as titlef:
            for line in titlef:
                id, title = line.strip().split('\t')
                if id in labels:
                    logging.warning(
                        f"Duplicate title for {id}: ignoring previous title '{labels[id]}', using new title '{title}'.")
                labels[id] = title

    # Write out the compendium.
    publication_sets = set([frozenset(x) for x in dicts.values()])
    baretype = PUBLICATION.split(':')[-1]
    write_compendium(publication_sets, os.path.basename(publication_compendium), PUBLICATION, labels,
                     icrdf_filename=icrdf_filename)
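To illustrate the clustering step above on hypothetical identifiers: each concord line such as "PMID:1\teq\tdoi:10.1000/xyz" becomes a two-element set, and glom() unions overlapping sets so every identifier for the same article lands in one cluster, which then becomes one frozenset in publication_sets.

# Hypothetical inputs -- identifiers invented for illustration.
pairs = [
    {'PMID:1', 'doi:10.1000/xyz'},
    {'PMID:1', 'PMC:PMC100'},
]
dicts = {}
glom(dicts, pairs, unique_prefixes=[PMID])
# Afterwards dicts['PMID:1'], dicts['doi:10.1000/xyz'] and dicts['PMC:PMC100']
# all point at the same merged cluster of the three identifiers.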
4 changes: 4 additions & 0 deletions src/prefixes.py
@@ -72,3 +72,7 @@
HGNCFAMILY='HGNC.FAMILY'
PANTHERFAMILY='PANTHER.FAMILY'
COMPLEXPORTAL='ComplexPortal'

PMID = 'PMID'
DOI = 'doi'
PMC = 'PMC'