information content enhancements (#781)
* information content enhancements

* ruff

* ruff
cmungall authored Jul 6, 2024
1 parent b32706b commit 7774d90
Showing 15 changed files with 2,277 additions and 747 deletions.
718 changes: 718 additions & 0 deletions notebooks/GO/Edge-IC-Analysis.ipynb

Large diffs are not rendered by default.

528 changes: 528 additions & 0 deletions notebooks/GO/Edge-Information-Analysis.ipynb

Large diffs are not rendered by default.

1,430 changes: 714 additions & 716 deletions poetry.lock

Large diffs are not rendered by default.

10 changes: 5 additions & 5 deletions pyproject.toml
@@ -25,7 +25,7 @@ funowl = ">=0.2.0"
gilda = {version = ">=1.0.0", optional = true}
semsimian = {version = ">=0.2.16", optional = true}
kgcl-rdflib = "0.5.0"
llm = {version = "*", optional = true}
llm = "^0.14"
html2text = {version = "*", optional = true}
aiohttp = {version = "*", optional = true}
pystow = ">=0.5.0"
@@ -34,16 +34,16 @@ ontoportal-client = ">=0.0.3"
prefixmaps = ">=0.1.2"
ols-client = ">=0.1.1"
airium = ">=0.2.5"
ndex2 = "^3.5.0"
ndex2 = ">=3.5.0"
pysolr = "^3.9.0"
eutils = ">=0.6.0"
requests-cache = "^1.0.1"
click = "*"
urllib3 = {version = "< 2", optional = true}
pydantic = "*"
jsonlines = "^4.0.0"
tenacity = "^8.2.3"
defusedxml = "^0.7.1"
jsonlines = "*"
tenacity = ">=8.2.3"
defusedxml = ">=0.7.1"


[tool.poetry.dev-dependencies]
39 changes: 30 additions & 9 deletions src/oaklib/cli.py
@@ -2744,21 +2744,28 @@ def information_content(
writer.file = output
if not isinstance(impl, SemanticSimilarityInterface):
raise NotImplementedError(f"Cannot execute this with {type(impl)}")
if len(terms) == 0:
raise ValueError("You must specify a list of terms. Use '.all' for all terms")
actual_predicates = _process_predicates_arg(predicates)
n = 0
logging.info("Fetching ICs...")
for curie_it in chunk(query_terms_iterator(terms, impl)):
logging.info("** Next chunk:")
n += 1
if terms:
for curie_it in chunk(query_terms_iterator(terms, impl)):
logging.info("** Next chunk:")
for curie, ic in impl.information_content_scores(
curie_it,
object_closure_predicates=actual_predicates,
use_associations=use_associations,
):
obj = dict(id=curie, information_content=ic)
writer.emit(obj)
n += 1
else:
for curie, ic in impl.information_content_scores(
curie_it,
object_closure_predicates=actual_predicates,
use_associations=use_associations,
):
obj = dict(id=curie, information_content=ic)
writer.emit(obj)
n += 1
if n == 0:
raise ValueError(f"No results for input: {terms}")
writer.finish()
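
Note: after this hunk, an empty term list is no longer an error; the CLI falls through to the adapter, which now scores every entity when no curies are given (see the semsim_interface.py change below). A minimal sketch of the equivalent programmatic call; the adapter descriptor and predicate CURIEs are illustrative assumptions, not part of this commit:

    from oaklib import get_adapter

    adapter = get_adapter("sqlite:obo:go")  # any SemanticSimilarityInterface adapter
    # With no curies argument, all entities are scored, chunk by chunk
    for curie, ic in adapter.information_content_scores(
        object_closure_predicates=["rdfs:subClassOf", "BFO:0000050"],  # is_a, part_of
    ):
        print(curie, ic)
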
@@ -6572,9 +6579,17 @@ def generate_lexical_replacements(
"--patch-format",
help="Output syntax for patches.",
)
@click.option(
"--exclude-defined/--no-exclude-defined",
default=False,
show_default=True,
help="Exclude terms that already have definitions",
)
@output_option
@output_type_option
def generate_definitions(terms, apply_patch, patch, patch_format, output, output_type, **kwargs):
def generate_definitions(
terms, apply_patch, patch, patch_format, output, output_type, exclude_defined, **kwargs
):
"""
Generate definitions for a term or terms.
@@ -6606,8 +6621,14 @@ def generate_definitions(terms, apply_patch, patch, patch_format, output, output
writer.output = output
if not isinstance(impl, OntologyGenerationInterface):
raise NotImplementedError
all_terms = query_terms_iterator(terms, impl)
curie_defns = impl.generate_definitions(list(all_terms), **kwargs)
all_terms = list(query_terms_iterator(terms, impl))
logging.info(f"Generating definitions for {len(all_terms)} terms")
if exclude_defined:
exclusion_list = [x[0] for x in impl.definitions(all_terms)]
logging.info(f"Excluding {len(exclusion_list)} terms that already have definitions")
all_terms = list(set(all_terms) - set(exclusion_list))
logging.info(f"Generating definitions for final list of {len(all_terms)} terms")
curie_defns = impl.generate_definitions(all_terms, **kwargs)
change_list = []
for curie, defn in curie_defns:
change = kgcl.NewTextDefinition(
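
Note: a sketch of the new --exclude-defined filter in isolation, assuming (per the basic_ontology_interface.py docstring fix below) that definitions() yields (id, definition, metadata) tuples; the term list is hypothetical. One design caveat: the set difference in the hunk above does not preserve input order, so an order-preserving comprehension is shown here instead:

    all_terms = ["GO:0005634", "GO:0005737", "GO:0005739"]  # hypothetical input
    already_defined = {row[0] for row in impl.definitions(all_terms)}
    # order-preserving equivalent of list(set(all_terms) - set(exclusion_list))
    to_define = [t for t in all_terms if t not in already_defined]
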
18 changes: 14 additions & 4 deletions src/oaklib/implementations/agrkb/agrkb_implementation.py
@@ -122,15 +122,25 @@ def ontologies(self) -> Iterable[CURIE]:
def node(
self, curie: CURIE, strict=False, include_metadata=False, expand_curies=False
) -> obograph.Node:
"""
Get a node by CURIE.
Currently the only node type supported is a gene.
:param curie:
:param strict:
:param include_metadata:
:param expand_curies:
:return:
"""

session = self.requests_session()
url = f"{BASE_URL}/gene/{curie }"
url = f"{BASE_URL}/gene/{curie}"
response = session.get(url)
if response.status_code == 500 and not strict:
return obograph.Node(id=curie)
if response.status_code != 200:
raise ValueError(
f"Error fetching issues: {response.status_code} from {url} // {response.text}"
)
return obograph.Node(id=curie)
obj = response.json()
meta = obograph.Meta()
defn = obj.get("geneSynopsis", None)
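
Note: the change above softens the failure contract: with strict=False, a 500 from the AGR endpoint now degrades to a bare Node instead of raising. A sketch, where the adapter descriptor and gene CURIE are assumptions:

    from oaklib import get_adapter

    adapter = get_adapter("agrkb:")  # assumed AGR KB selector
    node = adapter.node("FB:FBgn0000490", strict=False)  # hypothetical gene CURIE
    print(node.id, node.lbl)  # lbl stays None if the endpoint returned a 500
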
65 changes: 61 additions & 4 deletions src/oaklib/implementations/amigo/amigo_implementation.py
@@ -2,6 +2,7 @@

import json
import logging
import math
from dataclasses import dataclass
from time import sleep
from typing import Any, Dict, Iterable, Iterator, List, Optional, Tuple
@@ -21,6 +22,7 @@
]

from oaklib.interfaces.basic_ontology_interface import LANGUAGE_TAG, RELATIONSHIP
from oaklib.interfaces.semsim_interface import SemanticSimilarityInterface
from oaklib.interfaces.usages_interface import UsagesInterface
from oaklib.types import CURIE, PRED_CURIE, SUBSET_CURIE
from oaklib.utilities.iterator_utils import chunk
@@ -32,6 +34,7 @@
LIMIT = 10000

ONTOLOGY_CLASS_CATEGORY = "ontology_class"
BIOENTITY_CATEGORY = "bioentity"

# TODO: derive from schema
DOCUMENT_CATEGORY = "document_category"
@@ -132,6 +135,7 @@ class AmiGOImplementation(
AssociationProviderInterface,
SearchInterface,
UsagesInterface,
SemanticSimilarityInterface,
):
"""
Wraps AmiGO endpoint.
@@ -260,8 +264,8 @@ def associations(
aggregator_knowledge_source="infores:go",
)
if add_closure_fields:
assoc.subject_closure = doc[ISA_PARTOF_CLOSURE]
assoc.subject_closure_label = doc[ISA_PARTOF_CLOSURE_LABEL]
assoc.object_closure = doc[ISA_PARTOF_CLOSURE]
assoc.object_closure_label = doc[ISA_PARTOF_CLOSURE_LABEL]
yield assoc

def _association_query(
@@ -274,9 +278,10 @@ def _association_query(
predicate_closure_predicates: Optional[List[PRED_CURIE]] = None,
object_closure_predicates: Optional[List[PRED_CURIE]] = None,
include_modified: bool = False,
document_category: str = "annotation",
**kwargs,
) -> Dict[str, Any]:
fq = {DOCUMENT_CATEGORY: ["annotation"]}
fq = {DOCUMENT_CATEGORY: [document_category]}
if subjects:
subjects = [_unnnormalize(s) for s in subjects]
fq[BIOENTITY] = subjects
@@ -311,7 +316,6 @@ def association_counts(
>>> adapter = get_adapter("amigo:NCBITaxon:9606")
>>> for term, count in adapter.association_counts(group_by="object"):
... print(f"Term: {term} Approx Count: {int(count / 1000) * 1000)}")
:param subjects:
:param predicates:
@@ -456,3 +460,56 @@ def basic_search(

for doc in results:
yield doc["entity"]

def information_content_scores(
self,
curies: Optional[Iterable[CURIE]] = None,
predicates: List[PRED_CURIE] = None,
object_closure_predicates: List[PRED_CURIE] = None,
use_associations: bool = None,
term_to_entities_map: Dict[CURIE, List[CURIE]] = None,
**kwargs,
) -> Iterator[Tuple[CURIE, float]]:
if curies and not isinstance(curies, list):
curies = list(curies)
fq = self._association_query(
predicates=predicates,
object_closure_predicates=object_closure_predicates,
# objects=curies,
document_category="bioentity",
)
solr = self._solr
n_bioentities = None
for term, count in _faceted_query(
solr,
fq,
facet_field=DOCUMENT_CATEGORY,
rows=0,
facet_limit=-1,
min_facet_count=1,
**kwargs,
):
if term == "bioentity":
n_bioentities = count
if n_bioentities is None:
raise ValueError("No bioentities found")
kwargs = {}
# if curies:
# kwargs["facet.query"] = [_fq_element(ISA_PARTOF_CLOSURE, curie) for curie in curies]
n = 0
for term, count in _faceted_query(
solr,
fq,
facet_field=ISA_PARTOF_CLOSURE,
rows=0,
facet_limit=-1,
min_facet_count=1,
**kwargs,
):
n += 1
if curies and term not in curies:
continue
ic = -math.log(count / n_bioentities) / math.log(2)
yield term, ic

logger.info(f"Iterated {n} counts")
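
Note: the score computed here is the standard corpus-based information content, IC(t) = -log2(n_t / N), where n_t is the number of bioentities whose is_a/part_of closure includes t and N is the total bioentity count; -math.log(x) / math.log(2) above is just log2. A self-contained sketch:

    import math

    def information_content(term_count: int, total_entities: int) -> float:
        # IC in bits; rarer terms score higher
        return -math.log2(term_count / total_entities)

    print(information_content(500, 20_000))  # a term on 500 of 20,000 bioentities: ~5.32 bits
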
19 changes: 14 additions & 5 deletions src/oaklib/implementations/llm_implementation.py
@@ -31,7 +31,7 @@
SeverityOptions,
ValidationConfiguration,
)
from oaklib.datamodels.vocabulary import HAS_DBXREF, HAS_DEFINITION_CURIE, SKOS_EXACT_MATCH
from oaklib.datamodels.vocabulary import HAS_DBXREF, HAS_DEFINITION_CURIE, IS_A, SKOS_EXACT_MATCH
from oaklib.interfaces import (
MappingProviderInterface,
OboGraphInterface,
@@ -187,7 +187,7 @@ class LLMImplementation(
model: "llm.Model" = None
"""The LLM model to use."""

default_model_id: str = "gpt-4-turbo"
default_model_id: str = "gpt-4o"

allow_direct_grounding: bool = False
"""The point of this implementation is to perform NER and delegate to a grounded."""
@@ -241,6 +241,9 @@ def basic_search(self, *args, **kwargs) -> Iterator[CURIE]:
def label(self, *args, **kwargs) -> Optional[str]:
return self.wrapped_adapter.label(*args, **kwargs)

def definitions(self, *args, **kwargs):
yield from self.wrapped_adapter.definitions(*args, **kwargs)

def descendants(
self,
*args,
@@ -408,14 +411,20 @@ def generate_definitions(
model = self.get_model()
if not isinstance(wrapped_adapter, OboGraphInterface):
raise NotImplementedError("LLM can only suggest definitions for OBO graphs")
if style_hints is None:
style_hints = ""
for curie in curies:
node = wrapped_adapter.node(curie)
info = f"id: {curie}\n"
info += f"label: {node.lbl}\n"
for _, p, o in wrapped_adapter.relationships(curie):
p_label = "is_a" if p == IS_A else wrapped_adapter.label(p)
o_label = wrapped_adapter.label(o)
if p_label and o_label:
info += f"{p_label}: {o_label}\n"
system_prompt = "Provide a textual definition for the given term."
system_prompt += style_hints
if style_hints:
system_prompt += " " + style_hints
logger.debug(f"System: {system_prompt}")
logger.debug(f"Prompt: {info}")
response = model.prompt(info, system=system_prompt).text()
yield curie, DefinitionPropertyValue(val=response)

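
Note: for a term such as GO:0005634 (nucleus), the loop above would assemble a prompt roughly like the following; the relationship line is illustrative and depends on the wrapped adapter's labels:

    # names mirror the method above; `model` is the llm model it resolved
    info = (
        "id: GO:0005634\n"
        "label: nucleus\n"
        "is_a: intracellular membrane-bounded organelle\n"
    )
    system_prompt = "Provide a textual definition for the given term."
    response = model.prompt(info, system=system_prompt).text()
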
2 changes: 2 additions & 0 deletions src/oaklib/implementations/quickgo/quickgo_implementation.py
@@ -153,6 +153,8 @@ def node(
return node

def label(self, curie: CURIE, lang: Optional[LANGUAGE_TAG] = None) -> Optional[str]:
if curie.startswith("biolink:"):
return None
try:
node = self.node(curie)
if node:
2 changes: 1 addition & 1 deletion src/oaklib/implementations/sqldb/sql_implementation.py
@@ -324,7 +324,7 @@ class SqlImplementation(
_information_content_cache: Dict[Tuple, float] = None
_relationships_by_subject_index: Dict[CURIE, List[RELATIONSHIP]] = None

max_items_for_in_clause: int = field(default_factory=lambda: 1000)
max_items_for_in_clause: int = field(default_factory=lambda: 100)

can_store_associations: bool = False
"""True if the underlying sqlite database has term_association populated."""
2 changes: 1 addition & 1 deletion src/oaklib/interfaces/basic_ontology_interface.py
@@ -1330,7 +1330,7 @@ def definitions(
:param include_metadata: if true, include metadata
:param include_missing: if true, include curies with no definition
:param lang: language tag
:return: iterator over definition objects
:return: iterator over (id, definition, metadata) tuples
"""
if include_metadata:
raise NotImplementedError()
14 changes: 13 additions & 1 deletion src/oaklib/interfaces/semsim_interface.py
@@ -17,6 +17,7 @@
from oaklib.interfaces.basic_ontology_interface import BasicOntologyInterface
from oaklib.interfaces.obograph_interface import OboGraphInterface
from oaklib.types import CURIE, PRED_CURIE
from oaklib.utilities.iterator_utils import chunk
from oaklib.utilities.obograph_utils import as_digraph
from oaklib.utilities.semsim.similarity_utils import setwise_jaccard_similarity

@@ -193,7 +194,7 @@ def get_information_content(

def information_content_scores(
self,
curies: Iterable[CURIE],
curies: Optional[Iterable[CURIE]] = None,
predicates: List[PRED_CURIE] = None,
object_closure_predicates: List[PRED_CURIE] = None,
use_associations: bool = None,
@@ -220,6 +221,17 @@
:param kwargs:
:return:
"""
if curies is None:
for curie_it in chunk(self.entities()):
yield from self.information_content_scores(
curie_it,
predicates=predicates,
object_closure_predicates=object_closure_predicates,
use_associations=use_associations,
term_to_entities_map=term_to_entities_map,
**kwargs,
)
return
curies = list(curies)
if self.cached_information_content_map is None and use_associations:
logging.info("Calculating and caching IC map from associations")
15 changes: 15 additions & 0 deletions src/oaklib/query.py
@@ -606,6 +606,18 @@ def chain_results(v):
subset = query_terms[0]
query_terms = query_terms[1:]
chain_results(adapter.subset_members(subset))
elif term.startswith(".root"):
logging.debug(f"Roots: {term}")
params = _parse_params(term)
this_predicates = params.get("predicates", predicates)
roots = adapter.roots(predicates=this_predicates)
chain_results(roots)
elif term.startswith(".leaf"):
logging.debug(f"Leafs: {term}")
params = _parse_params(term)
this_predicates = params.get("predicates", predicates)
leafs = adapter.leafs(predicates=this_predicates)
chain_results(leafs)
elif term.startswith(".is_obsolete"):
logging.debug("Obsolete")
chain_results(adapter.obsoletes())
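
Note: a sketch of the two new selectors from the Python side, assuming query_terms_iterator is importable from oaklib.query (where this hunk lives) and a local GO adapter:

    from oaklib import get_adapter
    from oaklib.query import query_terms_iterator

    adapter = get_adapter("sqlite:obo:go")
    roots = list(query_terms_iterator([".root"], adapter))  # terms with no parents over the default predicates
    leafs = list(query_terms_iterator([".leaf"], adapter))  # terms with no children
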
@@ -766,6 +778,9 @@ def curies_from_file(
if allow_labels and not adapter:
raise ValueError("Must provide an adapter to resolve labels")
for line in file.readlines():
line = line.strip()
if not line:
continue
line_no += 1
if ":" in line or not allow_labels:
m = re.match(r"^(\S+)", line)
(Diffs for the remaining 2 files were not loaded.)
