Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

added CAS, EC, GSBL, RTECS search options to get_etoxid() #241

Merged
merged 6 commits into from
Apr 24, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ Title: Chemical Information from the Web
Description: Chemical information from around the web. This package interacts
with a suite of web APIs for chemical information.
Type: Package
Version: 0.5.0.9000
Version: 0.5.0.9001
Date: 2020-02-29
License: MIT + file LICENSE
URL: https://docs.ropensci.org/webchem, https://github.com/ropensci/webchem
Expand Down
3 changes: 2 additions & 1 deletion NEWS
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@ webchem 0.5.0.9001

NEW FEATURES

* get_etoxid() now can search by CAS, EC, GSBL and RTECS numbers. Added `from = ` argument. [PR #241, added by @andschar]

* nist_ri() now can search by name, InChI, InChIKey, or CAS. The `cas` argument is deprecated. Use `query` instead with `from = "cas"`

MINOR IMPROVEMENTS
Expand All @@ -11,7 +13,6 @@ MINOR IMPROVEMENTS

BUG FIXES


* nist_ri() returned malformed tables or errored if there was only one entry for a query
* get_csid() now returns all csids when queried from formula
* get_csid() returned an error when query was NA [PR #226, fixed by stitam]
Expand Down
129 changes: 75 additions & 54 deletions R/etox.R
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,10 @@
#' @importFrom stats rgamma
#' @importFrom dplyr bind_rows
#' @param query character; The searchterm
#' @param from character; Type of input, can be one of "name" (chemical name),
#' "cas" (CAS Number), "ec" (European Community number for regulatory purposes),
#' "gsbl" (Identifier used by \url{https://www.gsbl.de}) and "rtecs" (Identifier used
#' by the Registry of Toxic Effects of Chemical Substances database).
#' @param match character; How should multiple hits be handeled? "all" returns
#' all matched IDs, "first" only the first match, "best" the best matching (by
#' name) ID, "ask" is a interactive mode and the user is asked for input, "na"
Expand All @@ -21,6 +25,7 @@
#' \code{\link{etox_tests}} for test results.
#' @author Eduard Szoecs, \email{eduardszoecs@@gmail.com}
#' @author Tamas Stirling, \email{stirling.tamas@@gmail.com}
#' @author Andreas Scharmueller, \email{andschar@@protonmail.com}
#' @export
#' @examples
#' \dontrun{
Expand All @@ -30,8 +35,12 @@
#' comps <- c("Triclosan", "Glyphosate")
#' get_etoxid(comps)
#' get_etoxid(comps, match = "all")
#' get_etoxid("34123-59-6", from = "cas") # Isoproturon
#' get_etoxid("133483", from = "gsbl") # 3-Butin-1-ol
#' get_etoxid("203-157-5", from = "ec") # Paracetamol
#' }
get_etoxid <- function(query,
from = c("name", "cas", "ec", "gsbl", "rtecs"),
Aariq marked this conversation as resolved.
Show resolved Hide resolved
match = c("best", "all", "first", "ask", "na"),
verbose = TRUE) {
clean_char <- function(x) {
Expand All @@ -43,81 +52,93 @@ get_etoxid <- function(query,
x <- gsub("(?<=[\\s])\\s*|^\\s+$", "", x, perl = TRUE)
return(x)
}
# checks
from <- match.arg(from)
match <- match.arg(match)
foo <- function(query, match, verbose) {
foo <- function(query, from, match, verbose) {
if (verbose)
message("Searching ", query)
baseurl <- "https://webetox.uba.de/webETOX/public/search/stoff.do"
if (from == 'name') {
Aariq marked this conversation as resolved.
Show resolved Hide resolved
body <- list("stoffname.selection[0].name" = query,
"stoffname.selection[0].type" = "",
stitam marked this conversation as resolved.
Show resolved Hide resolved
event = "Search")
} else {
from_look <- c(cas = 69,
ec = 70,
rtecs = 72,
gsbl = 73)
type <- from_look[ names(from_look) == from ]
body <- list('stoffnummer.selection[0].name' = query,
'stoffnummer.selection[0].type' = type,
event = "Search")
}
Sys.sleep(rgamma(1, shape = 15, scale = 1/10))
h <- POST(url = baseurl, body = list("stoffname.selection[0].name" = query,
event = "Search"))
# get substances and links
h <- POST(url = baseurl,
handle = handle(''),
body = body)
tt <- read_html(h)
subs <- clean_char(xml_text(xml_find_all(
tt, "//*/table[@class = 'listForm resultList']//a")))[-1]
if (length(subs) == 0) {
if (verbose)
message("Substance not found! Returning NA. \n")
message("Substance not found! Returning NA.")
id <- NA
matched_sub <- NA
d <- NA
}
if (length(subs) > 0) {
# types can be ETOX_NAME, SYNONYM
# the function currently uses all types, there is no filtering
type <- clean_char(xml_text(xml_find_all(
tt, "//*/table[@class = 'listForm resultList']/tr/td[2]")))
if (!'ETOX_NAME' %in% type) {
warning('No ETOX_NAME found. Returning SYNONYM.')
}
stitam marked this conversation as resolved.
Show resolved Hide resolved
links <- xml_attr(xml_find_all(
tt, "//*/table[@class = 'listForm resultList']//a"), "href")[-1]
}
if (length(subs) == 1) {
id <- gsub("^.*\\?id=(.*)", "\\1", links)
d <- ifelse(match == "best", 0, as.character(0))
matched_sub <- subs[1]
}
# multiple hits
if (length(subs) > 1) {
if (verbose)
message("More then one Link found.")
if (match == "na") {
if (verbose)
message("Returning NA.")
id <- NA
matched_sub <- NA
d <- "na"
}
# multiple hits
if (length(subs) == 1) {
if (match == "all") {
if (verbose)
message("Returning all matches.")
id <- gsub("^.*\\?id=(.*)", "\\1", links)
d <- ifelse(match == "best", 0, as.character(0))
matched_sub <- subs[1]
matched_sub <-
subs[sapply(id, function(x)
grep(x, subs)[1])]
d <- "all"
}
if (match == "first") {
if (verbose)
message("Returning first match.")
id <- gsub("^.*\\?id=(.*)", "\\1", links[1])
matched_sub <- subs[grep(id[1], subs)[1]]
d <- "first"
}
if (length(subs) > 1){
if (match == "best") {
if (verbose)
message("More then one Link found. \n")
if (match == "na") {
if (verbose)
message("Returning NA. \n")
id <- NA
matched_sub <- NA
d <- "na"
}
if (match == "all") {
if (verbose)
message("Returning all matches. \n")
id <- gsub("^.*\\?id=(.*)", "\\1", links)
matched_sub <- subs[sapply(id, function(x) grep(x, subs)[1])]
d <- "all"
}
if (match == "first") {
if (verbose)
message("Returning first match. \n")
id <- gsub("^.*\\?id=(.*)", "\\1", links[1])
matched_sub <- subs[grep(id[1], subs)[1]]
d <- "first"
}
if (match == "best") {
if (verbose)
message("Returning best match. \n")
msubs <- gsub(" \\(.*\\)", "", subs)
dd <- adist(query, msubs) / nchar(msubs)
id <- gsub("^.*\\?id=(.*)", "\\1", links[which.min(dd)])
d <- round(dd[which.min(dd)], 2)
matched_sub <- subs[which.min(dd)]
}
if (match == "ask") {
matched_sub <- chooser(subs, "all")
id <- gsub("^.*\\?id=(.*)", "\\1", links[which(subs == matched_sub)])
d <- "interactive"
}
message("Returning best match. \n")
msubs <- gsub(" \\(.*\\)", "", subs)
dd <- adist(query, msubs) / nchar(msubs)
id <- gsub("^.*\\?id=(.*)", "\\1", links[which.min(dd)])
d <- round(dd[which.min(dd)], 2)
matched_sub <- subs[which.min(dd)]
}
if (match == "ask") {
matched_sub <- chooser(subs, "all")
id <-
gsub("^.*\\?id=(.*)", "\\1", links[which(subs == matched_sub)])
d <- "interactive"
}
}
# return object
hit <- data.frame(
"etoxid" = id,
Expand All @@ -128,7 +149,7 @@ get_etoxid <- function(query,
)
return(hit)
}
out <- lapply(query, foo, match = match, verbose = verbose)
out <- lapply(query, foo, from = from, match = match, verbose = verbose)
out <- dplyr::bind_rows(out)
return(out)
}
Expand Down
11 changes: 11 additions & 0 deletions man/get_etoxid.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

9 changes: 9 additions & 0 deletions tests/testthat/test-etox.R
Original file line number Diff line number Diff line change
Expand Up @@ -9,18 +9,27 @@ test_that("get_etoxid returns correct results", {
o2 <- get_etoxid(comps, match = "all")
o3 <- get_etoxid("Triclosan", match = "first")
o4 <- get_etoxid("Triclosan", match = "na")
o5 <- get_etoxid("1071-83-6", from = 'cas', match = 'best')
o6 <- get_etoxid("133483", from = "gsbl")
o7 <- get_etoxid("203-157-5", from = "ec")
do2 <- get_etoxid("Thiamethoxam")

expect_is(o1, "data.frame")
expect_is(o2, "data.frame")
expect_is(o3, "data.frame")
expect_is(o4, "data.frame")
expect_is(o5, "data.frame")
expect_is(o6, "data.frame")
expect_is(o7, "data.frame")
expect_is(do2, "data.frame")

expect_equal(o1$etoxid, c("20179", "9051"))
expect_equal(o2$etoxid, c("89236", "20179", "9051"))
expect_equal(o3$distance, "first")
expect_equal(do2$distance, 0)
expect_equal(o5$etoxid, "7419")
expect_equal(o6$etoxid, "97558")
expect_equal(o7$etoxid, "12416")

# only synonyms found
expect_warning(get_etoxid("Tetracyclin"))
Expand Down