Add PubMed IDs #227

Merged
merged 27 commits on Apr 22, 2024
Changes from all commits (27 commits):
c3b9509
Started adding support for recursive download.
gaurav Jan 23, 2024
5f09c14
Added a basic recursion option for Wget.
gaurav Jan 30, 2024
32224fd
First stab at publication download.
gaurav Jan 30, 2024
1976a64
Working PubMed download code, in-progress parsing code.
gaurav Jan 30, 2024
6578eda
Apparently working title extraction.
gaurav Jan 30, 2024
c3285c0
Fixed parsing code.
gaurav Jan 30, 2024
533955e
First stab at generating a Publication compendium.
gaurav Feb 19, 2024
b048d9c
Added code to create compendia and synonym directories if missing.
gaurav Feb 23, 2024
222b809
Improved inputs.
gaurav Mar 26, 2024
c06c14e
Noted outputs as directories.
gaurav Mar 26, 2024
88bc98e
Oops, directory() flag is only for outputs.
gaurav Mar 26, 2024
15482a7
Added publication_outputs to reports.
gaurav Mar 28, 2024
0b200e3
First stab at adding titles.
gaurav Mar 28, 2024
a0c93f4
Fixed path to PubMed titles.
gaurav Mar 28, 2024
ba37d54
Added updatefiles, fixed PubStatuses.
gaurav Apr 4, 2024
e86393a
Improved indentation and warnings.
gaurav Apr 4, 2024
433f78d
Workaround for lack of XPath support.
gaurav Apr 4, 2024
103bfad
Updated PubStatus to record all statuses.
gaurav Apr 4, 2024
6932885
Added an option to track PubStatuses as we go.
gaurav Apr 4, 2024
a7349f4
Add PMCIDs.
gaurav Apr 4, 2024
ed340fa
Oops, it's PMC, not PMCID.
gaurav Apr 4, 2024
3ed1b5f
Fixed bug in PMC output -- missing newline.
gaurav Apr 5, 2024
71f596a
Write out the status file as a JSONL gzipped file.
gaurav Apr 5, 2024
7c61e4a
Write the titles file as titles.tsv to make it easier to use.
gaurav Apr 5, 2024
c259c69
Improved documentation.
gaurav Apr 15, 2024
17f9c3a
Added Publication assessments and added to Snakefile.
gaurav Apr 15, 2024
680c01b
Added Publication.txt to expected synonyms.
gaurav Apr 15, 2024
2 changes: 2 additions & 0 deletions Snakefile
@@ -13,6 +13,7 @@ include: "src/snakefiles/taxon.snakefile"
include: "src/snakefiles/genefamily.snakefile"
include: "src/snakefiles/leftover_umls.snakefile"
include: "src/snakefiles/macromolecular_complex.snakefile"
include: "src/snakefiles/publications.snakefile"

include: "src/snakefiles/reports.snakefile"
include: "src/snakefiles/exports.snakefile"
@@ -46,6 +47,7 @@ rule all_outputs:
        config['output_directory'] + '/reports/umls_done',
        config['output_directory'] + '/reports/macromolecular_complex_done',
        config['output_directory'] + '/reports/drugchemical_done',
        config['output_directory'] + '/reports/publications_done',
    output:
        x = config['output_directory'] + '/reports/outputs_done'
    shell:
2 changes: 2 additions & 0 deletions config.json
@@ -65,6 +65,8 @@
"http://www.informatics.jax.org/marker/MGI:": "MGI"
},

"publication_outputs": ["Publication.txt"],

"geneprotein_outputs": ["GeneProtein.txt"],
"drugchemical_outputs": ["DrugChemical.txt"],

61 changes: 53 additions & 8 deletions src/babel_utils.py
@@ -1,6 +1,7 @@
import logging
import subprocess
import traceback
from enum import Enum
from ftplib import FTP
from io import BytesIO
import gzip
@@ -207,30 +208,47 @@ def pull_via_urllib(url: str, in_file_name: str, decompress = True, subpath=None
    return out_file_name


# Recursion options for pull_via_wget().
# See https://www.gnu.org/software/wget/manual/html_node/Recursive-Download.html for wget's recursion options.
class WgetRecursionOptions(Enum):
    NO_RECURSION = 0            # Don't do any recursion
    RECURSE_SUBFOLDERS = 1      # Recurse into subfolders -- equivalent to `--recursive --no-parent --no-directories`
    RECURSE_DIRECTORY_ONLY = 2  # Recurse through a single directory only -- equivalent to `--recursive --no-parent --no-directories --level=1`


def pull_via_wget(
        url_prefix: str,
        in_file_name: str,
        decompress=True,
        subpath:str=None,
        outpath:str=None,
        continue_incomplete:bool=True,
        recurse:WgetRecursionOptions = WgetRecursionOptions.NO_RECURSION,
        retries:int=10):
    """
    Download a file using wget. We call wget from the command line, and use command line options to
    request continuing incomplete downloads.

    :param url_prefix: The URL prefix to download.
    :param in_file_name: The filename to download -- this will be concatenated to the URL prefix. This should include
        the compression extension (e.g. `.gz`); we will remove that extension during decompression. If recursion is
        turned on, in_file_name refers to the directory where the recursive content will be downloaded.
    :param decompress: Whether this is a Gzip file that should be decompressed after download.
    :param subpath: The subdirectory of `babel_download` where this file should be stored.
    :param outpath: The full output directory to write this file to. subpath and outpath cannot both be set at the same time.
    :param continue_incomplete: Should wget continue an incomplete download?
    :param recurse: Do we want to download recursively? Should be a WgetRecursionOptions value, such as WgetRecursionOptions.NO_RECURSION.
    :param retries: The number of retries to attempt.
    """

    # Prepare download URL and location
    download_dir = get_config()['download_directory']
    url = url_prefix + in_file_name
    if subpath and outpath:
        raise RuntimeError("pull_via_wget() cannot be called with both subpath and outpath set.")
    elif outpath:
        dl_file_name = outpath
    elif subpath:
        dl_file_name = os.path.join(download_dir, subpath, in_file_name)
    else:
        dl_file_name = os.path.join(download_dir, in_file_name)
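For orientation, here is a minimal sketch of how the new recurse parameter is intended to be called; it mirrors the PubMed baseline download added in src/createcompendia/publications.py below, so the URL and subpath come from that file:

from src.babel_utils import pull_via_wget, WgetRecursionOptions

# Mirror ftp://ftp.ncbi.nlm.nih.gov/pubmed/baseline into <download_directory>/PubMed/baseline.
# With recursion enabled, 'baseline' names the directory to fill rather than a single file to write.
pull_via_wget(
    'ftp://ftp.ncbi.nlm.nih.gov/pubmed/', 'baseline',
    decompress=False,
    subpath='PubMed',
    recurse=WgetRecursionOptions.RECURSE_SUBFOLDERS)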
@@ -247,7 +265,18 @@ def pull_via_wget(

    # Add URL and output file.
    wget_command_line.append(url)

    # Handle recursion options
    match recurse:
        case WgetRecursionOptions.NO_RECURSION:
            # Write to a single file, dl_file_name
            wget_command_line.extend(['-O', dl_file_name])
        case WgetRecursionOptions.RECURSE_SUBFOLDERS:
            # dl_file_name should be a directory name.
            wget_command_line.extend(['--recursive', '--no-parent', '--no-directories', '--directory-prefix=' + dl_file_name])
        case WgetRecursionOptions.RECURSE_DIRECTORY_ONLY:
            # dl_file_name should be a directory name.
            wget_command_line.extend(['--recursive', '--no-parent', '--no-directories', '--level=1', '--directory-prefix=' + dl_file_name])

    # Execute wget.
    logging.info(f"Downloading {dl_file_name} using wget: {wget_command_line}")
@@ -266,12 +295,21 @@ def pull_via_wget(
        else:
            raise RuntimeError(f"Don't know how to decompress {in_file_name}")

        if os.path.isfile(uncompressed_filename):
            file_size = os.path.getsize(uncompressed_filename)
            if file_size > 0:
                logging.info(f"Downloaded {uncompressed_filename} from {url}, file size {file_size} bytes.")
            else:
                raise RuntimeError(f'Expected uncompressed file {uncompressed_filename} does not exist.')
        else:
            raise RuntimeError(f'Expected uncompressed file {uncompressed_filename} does not exist.')

    if os.path.isfile(dl_file_name):
        file_size = os.path.getsize(dl_file_name)
        logging.info(f"Downloaded {dl_file_name} from {url}, file size {file_size} bytes.")
    elif os.path.isdir(dl_file_name):
        # Count the number of files in directory dl_file_name
        dir_size = sum(os.path.getsize(os.path.join(dl_file_name, f)) for f in os.listdir(dl_file_name) if os.path.isfile(os.path.join(dl_file_name, f)))
        logging.info(f"Downloaded {dir_size} files from {url} to {dl_file_name}.")
    else:
        raise RuntimeError(f'Unknown file type {dl_file_name}')


def sort_identifiers_with_boosted_prefixes(identifiers, prefixes):
@@ -316,7 +354,8 @@ def write_compendium(synonym_list,ofname,node_type,labels={},extra_prefixes=[],i
    :param synonym_list:
    :param ofname:
    :param node_type:
    :param labels: A map of identifiers to labels.
        Not needed if each identifier will have a label in the correct directory (i.e. downloads/PMID/labels for PMID:xxx).
    :param extra_prefixes: We default to only allowing the prefixes allowed for a particular type in Biolink.
        If you want to allow additional prefixes, list them here.
    :param icrdf_filename: (REQUIRED) The file to read the information content from (icRDF.tsv). Although this is a
@@ -345,6 +384,12 @@ def write_compendium(synonym_list,ofname,node_type,labels={},extra_prefixes=[],i
    description_factory = DescriptionFactory(make_local_name(''))
    taxon_factory = TaxonFactory(make_local_name(''))
    node_test = node_factory.create_node(input_identifiers=[], node_type=node_type, labels={}, extra_prefixes=extra_prefixes)

    # Create compendia and synonyms directories, just in case they haven't been created yet.
    os.makedirs(os.path.join(cdir, 'compendia'), exist_ok=True)
    os.makedirs(os.path.join(cdir, 'synonyms'), exist_ok=True)

    # Write compendium and synonym files.
    with jsonlines.open(os.path.join(cdir, 'compendia', ofname), 'w') as outf, jsonlines.open(os.path.join(cdir, 'synonyms', ofname), 'w') as sfile:
        for slist in synonym_list:
            node = node_factory.create_node(input_identifiers=slist, node_type=node_type, labels=labels, extra_prefixes=extra_prefixes)
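To make the labels parameter concrete for the publications case: generate_compendium() in src/createcompendia/publications.py (below) builds it as a plain dict from CURIE to article title, along these lines. The identifier and title here are invented for illustration.

# Illustrative labels mapping as built by generate_compendium() below
# (the PMID and title are made up for the example):
labels = {'PMID:00000000': 'An example article title'}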
1 change: 1 addition & 0 deletions src/categories.py
@@ -29,4 +29,5 @@
PROCEDURE = 'biolink:Procedure'
PROTEIN = 'biolink:Protein'
PUBLICATION = 'biolink:Publication'
JOURNAL_ARTICLE = 'biolink:JournalArticle'
SMALL_MOLECULE = 'biolink:SmallMolecule'
210 changes: 210 additions & 0 deletions src/createcompendia/publications.py
@@ -0,0 +1,210 @@
import gzip
import json
import logging
import os
import time
from collections import defaultdict
from pathlib import Path
import xml.etree.ElementTree as ET

from src.babel_utils import pull_via_wget, WgetRecursionOptions, glom, read_identifier_file, write_compendium
from src.categories import JOURNAL_ARTICLE, PUBLICATION
from src.prefixes import PMID, DOI, PMC


def download_pubmed(download_file,
                    pubmed_base='ftp://ftp.ncbi.nlm.nih.gov/pubmed/',
                    pmc_base='https://ftp.ncbi.nlm.nih.gov/pub/pmc/'):
    """
    Download PubMed. We download both the PubMed annual baseline and the daily update files,
    which are in the same format, but the baseline is set up at the start of the year and then
    updates are included in the daily update files.

    :param download_file: A `done` file that should be created to indicate that we are done.
    :param pubmed_base: The PubMed base URL to download files from.
    :param pmc_base: The PMC base URL to download the PMC-ids.csv.gz mapping file from.
    """

    # Create directories if they don't exist.
    os.makedirs(os.path.dirname(download_file), exist_ok=True)

    # Step 1. Download all the files for the PubMed annual baseline.
    pull_via_wget(
        pubmed_base, 'baseline',
        decompress=False,
        subpath='PubMed',
        recurse=WgetRecursionOptions.RECURSE_SUBFOLDERS)

    # Step 2. Download all the files for the PubMed update files.
    pull_via_wget(
        pubmed_base, 'updatefiles',
        decompress=False,
        subpath='PubMed',
        recurse=WgetRecursionOptions.RECURSE_SUBFOLDERS)

    # Step 3. Download the PMC/PMID mapping file from PMC.
    pull_via_wget(pmc_base, 'PMC-ids.csv.gz', decompress=True, subpath='PubMed')

    # We're all done!
    Path.touch(download_file)


def parse_pubmed_into_tsvs(baseline_dir, updatefiles_dir, titles_file, status_file, pmid_id_file,
                           pmid_doi_concord_file):
    """
    Reads through the PubMed files in the baseline_dir and updatefiles_dir, and writes out label and status information.

    :param baseline_dir: The PubMed baseline directory to parse.
    :param updatefiles_dir: The PubMed updatefiles directory to parse.
    :param titles_file: An output TSV file in the format `<PMID>\t<TITLE>`.
    :param status_file: A gzipped JSONL file containing publication status information.
    :param pmid_id_file: An output TSV file listing each PMID and its Biolink type.
    :param pmid_doi_concord_file: A concord file in the format `<PMID>\teq\t<DOI>` and other identifiers.
    """

    # We can write labels and concords as we go.
    with open(titles_file, 'w') as titlesf, open(pmid_id_file, 'w') as pmidf, open(pmid_doi_concord_file, 'w') as concordf:
        # Track PubMed article statuses. In theory the final PubMed entry should have all the dates, which should
        # tell us the final status of a publication, but really we just want to know if the article has ever been
        # marked as retracted, so instead we track every status that has ever been attached to any article. We
        # don't have a way of tracking properties yet (https://github.com/TranslatorSRI/Babel/issues/155), so for now
        # we write this out in JSON to the status_file.
        pmid_status = defaultdict(set)

        # Read every file in the baseline and updatefiles directories (they have the same format).
        baseline_filenames = list(map(lambda fn: os.path.join(baseline_dir, fn), sorted(os.listdir(baseline_dir))))
        updatefiles_filenames = list(
            map(lambda fn: os.path.join(updatefiles_dir, fn), sorted(os.listdir(updatefiles_dir))))

        for pubmed_filename in (baseline_filenames + updatefiles_filenames):
            if not pubmed_filename.endswith(".xml.gz"):
                logging.warning(f"Skipping non-gzipped-XML file {pubmed_filename} in PubMed files.")
                continue

            with gzip.open(pubmed_filename, 'rt') as pubmedf:
                logging.info(f"Parsing PubMed Baseline {pubmed_filename}")

                start_time = time.time_ns()
                count_articles = 0
                count_pmids = 0
                count_dois = 0
                count_pmcs = 0
                count_titles = 0
                file_pubstatuses = set()

                # Read every XML entry from every PubMed file.
                parser = ET.XMLPullParser(['end'])
                for line in pubmedf:
                    parser.feed(line)
                    for event, elem in parser.read_events():
                        if event == 'end' and elem.tag == 'PubmedArticle':
                            count_articles += 1

                            # Look for the pieces of information we want.
                            pmids = elem.findall("./PubmedData/ArticleIdList/ArticleId[@IdType='pubmed']")
                            dois = elem.findall("./PubmedData/ArticleIdList/ArticleId[@IdType='doi']")
                            pmcs = elem.findall("./PubmedData/ArticleIdList/ArticleId[@IdType='pmc']")
                            titles = elem.findall('.//ArticleTitle')

                            # Retrieve the PubDates containing PubStatuses.
                            pubdates_with_pubstatus = elem.findall("./PubmedData/History/PubMedPubDate[@PubStatus]")
                            pubstatuses = set()
                            for pubdate in pubdates_with_pubstatus:
                                # We ignore the dates, and instead record all the PubStatuses that a PMID has ever had.
                                if pubdate.get('PubStatus'):
                                    pubstatuses.add(pubdate.get('PubStatus'))

                            # Write information for each PMID.
                            for pmid in pmids:
                                count_pmids += 1

                                # Write out PMID type.
                                pmidf.write(f"{PMID}:{pmid.text}\t{JOURNAL_ARTICLE}\n")

                                # Update PMID status.
                                pmid_status[f'{PMID}:' + pmid.text].update(pubstatuses)
                                file_pubstatuses.update(pubstatuses)

                            # Write out the titles.
                            for title in titles:
                                count_titles += 1
                                # Convert newlines into '\n'.
                                title_text = title.text
                                if not title_text:
                                    continue
                                title_text = title_text.replace('\n', '\\n')

                                titlesf.write(f"{PMID}:{pmid.text}\t{title_text}\n")

                            # Write out the DOIs to the concords file.
                            for doi in dois:
                                count_dois += 1
                                concordf.write(f"{PMID}:{pmid.text}\teq\t{DOI}:{doi.text}\n")

                            # Write out the PMCIDs to the concords file.
                            for pmc in pmcs:
                                count_pmcs += 1
                                concordf.write(f"{PMID}:{pmid.text}\teq\t{PMC}:{pmc.text}\n")

                time_taken_in_seconds = float(time.time_ns() - start_time) / 1_000_000_000
                logging.info(
                    f"Parsed {count_articles} articles from PubMed {pubmed_filename} in " +
                    f"{time_taken_in_seconds:.4f} seconds: {count_pmids} PMIDs, {count_dois} DOIs, " +
                    f"{count_pmcs} PMCs, " +
                    f"{count_titles} titles with the following PubStatuses: {sorted(file_pubstatuses)}.")

    # Write the statuses into a gzipped JSONL file.
    with gzip.open(status_file, 'wt') as statusf:
        # This will be more readable as a JSONL file, so let's write it out that way.
        for pmid, statuses in pmid_status.items():
            statusf.write(json.dumps({'id': pmid, 'statuses': sorted(statuses)}, sort_keys=True) + '\n')
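Since the motivation above is mainly to spot articles that have ever been marked as retracted, here is a hedged sketch of how the resulting gzipped JSONL status file could be scanned downstream; the path is illustrative, and 'retracted' is assumed to appear as a PubStatus value.

import gzip
import json

# Sketch: list PMIDs whose recorded PubStatuses ever included 'retracted'.
with gzip.open('babel_downloads/PubMed/statuses.jsonl.gz', 'rt') as statusf:  # illustrative path
    for line in statusf:
        record = json.loads(line)
        if 'retracted' in record['statuses']:
            print(record['id'])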


def generate_compendium(concordances, identifiers, titles, publication_compendium, icrdf_filename):
    """
    Generate a Publication compendium using the ID and Concord files provided.

    :param concordances: A list of concordances to use.
    :param identifiers: A list of identifiers to use.
    :param titles: A list of title TSV files (PMID to title) to use as labels.
    :param publication_compendium: The publication compendium file to produce.
    :param icrdf_filename: The ICRDF file.
    """

    dicts = {}
    types = {}
    uniques = [PMID]

    # Load PMID identifiers.
    for ifile in identifiers:
        print('loading', ifile)
        new_identifiers, new_types = read_identifier_file(ifile)
        glom(dicts, new_identifiers, unique_prefixes=uniques)
        types.update(new_types)

    # Load concordances.
    for infile in concordances:
        print(infile)
        print('loading', infile)
        pairs = []
        with open(infile, 'r') as inf:
            for line in inf:
                x = line.strip().split('\t')
                pairs.append({x[0], x[2]})
        glom(dicts, pairs, unique_prefixes=uniques)

    # Publications have titles, not labels. We load them here.
    labels = dict()
    for title_filename in titles:
        print('loading titles from', title_filename)
        with open(title_filename, 'r') as titlef:
            for line in titlef:
                id, title = line.strip().split('\t')
                if id in labels:
                    logging.warning(
                        f"Duplicate title for {id}: ignoring previous title '{labels[id]}', using new title '{title}'.")
                labels[id] = title

    # Write out the compendium.
    publication_sets = set([frozenset(x) for x in dicts.values()])
    baretype = PUBLICATION.split(':')[-1]
    write_compendium(publication_sets, os.path.basename(publication_compendium), PUBLICATION, labels,
                     icrdf_filename=icrdf_filename)
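To illustrate the clustering step above on hypothetical identifiers: each concord line such as "PMID:1\teq\tdoi:10.1000/xyz" becomes a two-element set, and glom() unions overlapping sets so every identifier for the same article lands in one cluster, which then becomes one frozenset in publication_sets.

# Hypothetical inputs -- identifiers invented for illustration.
pairs = [
    {'PMID:1', 'doi:10.1000/xyz'},
    {'PMID:1', 'PMC:PMC100'},
]
dicts = {}
glom(dicts, pairs, unique_prefixes=[PMID])
# Afterwards dicts['PMID:1'], dicts['doi:10.1000/xyz'] and dicts['PMC:PMC100']
# all point at the same merged cluster of the three identifiers.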
4 changes: 4 additions & 0 deletions src/prefixes.py
@@ -72,3 +72,7 @@
HGNCFAMILY='HGNC.FAMILY'
PANTHERFAMILY='PANTHER.FAMILY'
COMPLEXPORTAL='ComplexPortal'

PMID = 'PMID'
DOI = 'doi'
PMC = 'PMC'