danchaltiel / edcimport Goto Github PK

View Code? Open in Web Editor NEW

0.0 0.0 1.0 9.91 MB

Import data from EDC softwares

Home Page: https://danchaltiel.github.io/EDCimport/

License: GNU General Public License v3.0

R 81.70% SAS 18.30%

edcimport's Introduction

edcimport's People

Contributors

Watchers

Forkers

oncostat

edcimport's Issues

Improve waterfall plot

If RCTLRESP is NA/NE, rc_sum not 0 but NA

add options: type = c(best_resp, worst_resp, timepoint), timepoint=NULL

y label = "Percent change from baseline at best/worst response date"
y label = "Percent change from baseline at timepoint"

New lesions: user_defined column.

`is_mixed()`

For a dataset, returns TRUE/FALSE.
Alternatively, long_names(), which would return the columns that contain long-format data.

cf split_mixed_datasets()

Don't `use_cache` if parameters are different

For instance, if a cache file was written with `split_mixed=TRUE` and you then change that parameter and read again.

improve/fix `check_subjid()`

Multiple things:

Fix bug when x is character and ref is numeric -> parse both to character
improve message: "Missing {n} subject ID in ..."
improve UI: if x is a dataframe, select subjid/patno column

improve `assert_no_duplicate()`

improve message
Present message:
Error in assert_no_duplicate(.) : !anyDuplicated(df[[id]]) is not TRUE
implement "by": assert_no_duplicate(., by="visit")

add `assert_no_duplicate()`

A function to assert that a table has only 1 row per patient

#' Assert that a table has at most one row per patient
#'
#' @param df A data frame.
#' @param id Name of the identifier column (default `"subjid"`).
#' @return `df`, unchanged, so the check can end a pipeline.
#'   Stops with an error if `id` is not a column of `df` or if that column
#'   holds duplicated values.
assert_no_duplicate = function(df, id="subjid"){
  # A misspelled `id` would make df[[id]] NULL, which has no duplicates:
  # without this guard the assertion would pass silently.
  if(!id %in% names(df)){
    stop("Column \"", id, "\" not found in the table.", call.=FALSE)
  }
  dup = unique(df[[id]][duplicated(df[[id]])])
  if(length(dup)>0){
    stop("Duplicated values in column \"", id, "\": ",
         paste(dup, collapse=", "), call.=FALSE)
  }
  df
}

Example:

# Join three tables on "subjid", then assert at the end of the pipeline that
# the result has no duplicated identifier (joins can multiply rows).
db_itt = df1 %>% 
  full_join(df2, by="subjid") %>%
  left_join(df3, by="subjid") %>% 
  assert_no_duplicate()

TODO: raise a cli error that includes the name of the offending table `df`

add more insight when verbose>0

# Total number of cells: sum of nrow*ncol over the rows of .lookup
with(.lookup, sum(nrow*ncol))
# Size of `tm` as reported by object.size(), formatted with "auto" units
object.size(tm) %>% format("auto")

add an option manager

# Proposed interface: set an option through a named argument
edc_options(foo="bar")

like in https://github.com/DanChaltiel/crosstable/blob/main/R/options.R

`edc_data_warn()`

library(EDCimport)
library(tidyverse)
library(cli)
# Example data; `a` holds the rows of `long_pure` where val1 > 2, used below
# as the rows to warn about
tm = edc_example_mixed()
a = tm$long_pure %>% filter(val1>2)

#' Warn about a data issue and list the patients concerned
#'
#' @param df A table holding the problematic rows; nothing happens if it has
#'   no rows.
#' @param message The issue description, formatted with `cli::format_inline()`.
#' @param issue_n Issue identifier. `NULL` is shown as "xx"; a number is
#'   left-padded with zeros to 2 characters; anything else is used as is.
edc_data_warn = function(df, message, issue_n=NULL){
  # Empty table: no issue to report
  if(!(nrow(df)>0)) return(invisible(NULL))
  
  if(is.null(issue_n)){
    issue_n = "xx"
  } else if(is.numeric(issue_n)){
    issue_n = str_pad(issue_n, width=2, pad="0")
  }
  
  # Sorted, de-duplicated subject IDs taken from the column named by
  # get_subjid_cols()
  subj = sort(unique(pull(df, get_subjid_cols())))
  message = cli::format_inline(message)
  cli_warn("Issue #{col_green(issue_n)}: {message} (Patient{?s} {subj})")
}


# Same warning with three ways of marking up `val1` in the message
# ({.val}, {.q}, a colour function) and issue numbers of 1, 2 and 3 digits
edc_data_warn(a, "{.val val1} should be lesser than 1.1", issue_n=1)
edc_data_warn(a, "{.q val1} should be lesser than 1.1", issue_n=10)
edc_data_warn(a, "{col_red('val1')} should be lesser than 1.1", issue_n=100)

`assert_no_rows()`

#' Assert that a data frame has no rows
#'
#' @param df A data frame.
#' @param msg Optional error message passed to `cli_abort()`; a default
#'   message giving the number of rows is used when `NULL`.
#' @return `df`, invisibly, when it has no rows; aborts otherwise.
assert_no_rows = function(df, msg=NULL){
  # Nothing wrong: hand the table back silently
  if(!(nrow(df)>0)) return(invisible(df))
  
  if(is.null(msg)){
    msg = "Dataframe should have no rows but has {nrow(df)}."
  }
  cli_abort(msg)
}

`split_mixed_datasets()` doesn't preserve labels

It should.

See crosstable::copy_label_from()

warning in `find_keyword()`

# Minimal call that triggers the warning
find_keyword("www")
Warning: Unknown or uninitialised column: `invalid`.

support TRUE/FALSE and 0/1 in fct_yesno

#' Convert a yes/no-like vector to a factor with levels "Yes" then "No"
#'
#' @param x A factor or character vector of "Yes"/"No", a logical vector, or a
#'   numeric vector of 1/0.
#' @return A factor with levels `c("Yes", "No")`. Values matching neither
#'   level become `NA`.
yesno = function(x) {
  levels = if(is.factor(x) || is.character(x)) {
    c("Yes", "No")
  } else if(is.numeric(x)) {
    # factor() matches on as.character(x): "1"/"0" never match "TRUE"/"FALSE",
    # so numeric input needs its own levels
    c(1, 0)
  } else {
    c(TRUE, FALSE)
  }
  factor(x, levels=levels, labels=c("Yes", "No"))
}

Support multiple keys in `split_mixed_datasets()`

e.g. id=c(subjid, visitn)

How to manage the output though?
Change the name?

check_invalid_utf8

#check_invalid_utf8 = function()...
# Sketch of the function body: one row per (dataset, column) pair of .lookup,
# keeping only the rows whose label is flagged by is_invalid_utf8()
# (helper not shown here; presumably TRUE for labels that are not valid UTF-8).
.lookup %>% 
  arrange(desc(nrow)) %>% 
  unnest(c(names, labels)) %>% 
  mutate(
    invalid=is_invalid_utf8(labels)
  ) %>% 
  filter(invalid) %>% {
    # One "dataset$column (label)" entry per flagged label, each named "i"
    bad_utf8 = glue("{.$dataset}${.$names} ({.$labels}) ") %>% set_names("i")
    # Warn only when at least one label was flagged
    if(nrow(.))
    cli_warn(c("Found {length(bad_utf8)} invalid UTF-8 label{?s}", bad_utf8))
  }

ultimate swimmerplot

# For each dataset named in .lookup: fetch it by name, keep subjid plus its
# Date columns, and reshape to one row per (subjid, date column).
x = .lookup$dataset %>%
  set_names() %>% 
  map(~{
    a = get(.x) %>% 
      select(subjid, where(is.Date))
    # Fewer than 2 columns means no Date column was selected: skip the dataset
    if(ncol(a)<2) return(NULL)
    a %>% 
      pivot_longer(-subjid) %>% 
      # label: looked up by column name in get_label(a);
      # variable: "DATASET - COLUMN" in upper case
      mutate(label=get_label(a)[name],
             variable=paste0(toupper(.x), " - ", toupper(name)))
  })

# One point per date, one line of points per patient, coloured by label
list_rbind(x) %>% 
  mutate(subjid=factor(subjid)) %>% 
  ggplot(aes(x=value, y=subjid, color=label, group=subjid, label=variable)) + 
  # geom_line() +
  geom_point()

# Called without argument: presumably converts the last plot drawn (confirm)
plotly::ggplotly()

TODO: detect columns that could have been dates but are not, like 2023/02/UK or NA/NA/NA

Allow dataset selection in `read_trialmaster(split_mixed=)`

# Proposed: `split_mixed` accepts the names of the datasets to select
read_trialmaster(split_mixed=c("dataset1", "dataset2"))

Can we use tidyselection here?

add `check_subjid()`

A function to check that a table is not missing some patients.

#' Warn when a table is missing some of the reference subject IDs
#'
#' @param x A table with a `subjid` column.
#' @param ref Reference subject IDs; defaults to `enrolres$subjid`.
#'
#' Called for the warning it may emit.
check_subjid = function(x, ref=enrolres$subjid){
  # Reference IDs absent from x, in sorted order
  m = sort(setdiff(ref, x$subjid))
  # Every reference ID is present: stay silent
  if(length(m)==0) return(invisible(NULL))
  cli_warn("Missing subjid in {.arg {rlang::caller_arg(x)}}: {.val {m}}")
}

enrolres$subjid should be automatized, during .lookup creation maybe?

better reporting of import errors

# For each element of `tm`, the names of its columns that hold an import error
# (objects inheriting from "try-error"); elements without any are dropped.
x = tm %>% 
  map(function(dataset){
    failed = keep(dataset, function(column) inherits(column, "try-error"))
    names(failed)
  }) %>% 
  compact()

# Example: names of the failed columns of `gpaq`, then the content of the
# first element of one of them printed with cat()
x$gpaq
# [1] "gpaq03" "gpaq06" "gpaq09" "gpaq12" "gpaq15" "gpaq16"
cat(tm$gpaq$gpaq03[1])
#Error in abort_lossy_cast(x, to, ..., lossy = lossy) : 
#  Lossy cast from <character> to <hms> at position(s) 1

SUBJID as factor

To avoid type errors during joins, subjid should maybe always be a factor.
Users would have to set the name of the ENROLRES table (in read_tm()) so that the levels are the right ones.

`edc_options()` doesn't work with external variables

And with the prefix also??

library(EDCimport)
# Baseline: the option is unset
getOption("edc_subjid_ref")
#> NULL
# Value passed through a variable: the option is still unset afterwards
x=2
edc_options(edc_subjid_ref=x)
getOption("edc_subjid_ref")
#> NULL
# Value passed as a literal: the option is set
edc_options(edc_subjid_ref=1)
getOption("edc_subjid_ref")
#> [1] 1
# Same call with the package prefix: fails with an error
EDCimport::edc_options(edc_subjid_ref=1)
#> Error in (function (...) : invalid argument

^{Created on 2024-03-27 with reprex v2.1.0}

`followup_table()` to get max date per patient through all tables

#' Latest date per patient across all datasets
#'
#' Gathers every Date column of every dataset returned by `get_datasets()`
#' that is a data frame with a `subjid` column, and keeps, for each `subjid`,
#' the row with the maximum date (a single row per patient, ties dropped).
#'
#' @return A table sorted by `subjid` with columns `subjid`, `name` (the source
#'   column) and `value` (the date).
followup_table = function() {
  # subjid + Date columns of one dataset in long format, or NULL when the
  # dataset is not usable or has no Date column
  extract_dates = function(dataset) {
    if(!is.data.frame(dataset) || !"subjid" %in% names(dataset)) return(NULL)
    dates = select(dataset, subjid, where(is.Date))
    if(ncol(dates)<2) return(NULL)
    dates_long = pivot_longer(dates, -subjid)
    filter(dates_long, !is.na(value))
  }
  
  all_dates = get_datasets() %>% 
    map(extract_dates) %>% 
    discard(is.null) %>% 
    list_rbind()
  
  all_dates %>% 
    slice_max(value, by=subjid, with_ties=FALSE) %>% 
    arrange(subjid)
}

Error in `get_key_cols()` when a dataset has several id columns

EDCimport/R/helpers.R

Line 285 in cb9c347

patient_id=map_chr(names, ~.x[tolower(.x) %in% tolower(patient_id)] %0% NA),

Adding [1] will do:

  # map_chr() needs exactly one string per dataset. Taking the first match
  # with [1] avoids the error raised when a dataset has several matching
  # columns. `%0%` is defined elsewhere; presumably it supplies NA when there
  # is no match (confirm).
  rtn = lookup %>% 
    select(dataset, names) %>% 
    mutate(
      patient_id=map_chr(names, ~.x[tolower(.x) %in% tolower(patient_id)][1] %0% NA), 
      crfname=map_chr(names, ~.x[tolower(.x) %in% tolower(crfname)][1] %0% NA)
    )

str_trim(keyword) in `find_keyword()`

Also simplify ignore_case management?
Also add the percent of missing, like this:

# Columns matching "date", with `nna` = proportion of missing values in each
# column (dataset fetched by name), sorted by increasing `nna`
find_keyword("date") %>% 
  mutate(
    nna = map2_dbl(dataset, names, function(dataset_name, column_name){
      mean(is.na(get(dataset_name)[[column_name]]))
    })
  ) %>% 
  arrange(nna)

Implement AE butterfly plots

https://ascopubs.org/doi/10.1200/JCO.19.00915

Fig 3. Adverse events. The panel on the left is a butterfly plot showing the proportion of patients experiencing an adverse event, whatever the grade (light red for the busulfan and melphalan [BuMel] arm and light blue for the vincristine, dactinomycin, and ifosfamide [VAI] plus whole-lung irradiation [WLI] arm) and a severe adverse event (dark red for the BuMel arm and dark blue for VAI plus WLI arm) according to the randomization group. The panel on the right displays the relative risk of a severe adverse event in patients with BuMel relative to patients with VAI plus WLI, with 95% CIs for a 2 × 2 table. The acute toxicity related to chemotherapy was assessed after each course, using a list of 22 selected items from the National Cancer Institute Common Terminology Criteria for Adverse Events (version 2.0). A modified list of items was used to evaluate toxicity after radiotherapy, using Radiation Therapy Oncology Group classification for eight types of specific toxicities. A free text area was available to document other adverse reactions. The toxicity items were then pooled by category: bladder toxicity, cardiac toxicity, GI toxicity, general deterioration, hematologic toxicity, infection, liver toxicity, lung toxicity, neurologic toxicity (including mood alteration), renal toxicity, and skin toxicity. The respiratory tract toxicity (larynx, pharynx, salivary gland) reported after radiotherapy was pooled within the category of GI toxicity because of small numbers and because they were usually associated. Details are provided in the Data Supplement. For each adverse event type, the analysis was based on the maximum grade observed over the whole maintenance treatment duration. A grade 4 hematologic toxicity and a grade 3 or higher nonhematologic toxicity were classified as severe toxicities. The categories of adverse events was ordered by decreasing value of the relative risk of severe toxicity. 
This analysis was performed on the safety set (127 patients taking VAI plus WLI and 117 patients taking BuMel), excluding patients who did not receive the treatment allocated by randomization (as-treated population), as well as patients with missing data for toxicity assessment. The number of chemotherapy courses followed by toxicity over the whole maintenance treatment duration is detailed in the Data Supplement.

swimmerplot per center

Implement `compare_tm()`

# Two imports to compare (calls shown without arguments as placeholders)
tm1 = read_trialmaster()
tm2 = read_trialmaster()

# Proposed function: compare the two imports
compare_tm(tm1, tm2)

New tables, differences in data completion, missing rates...

add `fct_yesno()`

Useful to relevel factors so that Yes is the first level.

#' Relevel a yes/no variable so that "Yes" comes first
#'
#' @param x A vector. It is returned untouched unless it is a factor or a
#'   character vector whose values all belong to `lvl`.
#' @param lvl The accepted values, in the desired level order.
#' @param mutate_character If `FALSE`, character vectors are returned untouched.
#' @return A factor with levels `lvl` carrying the label of `x` (through
#'   `copy_label_from()`), or `x` itself when it does not qualify.
fct_yesno = function(x, lvl=c("Yes", "No", NA), mutate_character=TRUE){
  # Any of these conditions leaves `x` as it is
  skip = !(is.factor(x) || is.character(x)) ||
    (is.character(x) && isFALSE(mutate_character)) ||
    !all(x %in% lvl)
  if(skip) return(x)
  
  copy_label_from(factor(x, levels=lvl), x)
}

Apply that to all columns?

check that all options are handled

#' List the options read in the sources that `edc_options()` does not propose
#'
#' Reads every file of the `R/` directory (relative to the working directory),
#' finds the `getOption(...)` calls, and extracts the first argument of each
#' call when it is a quoted string.
#'
#' @return A character vector: the option names found in the sources that are
#'   not among the formal arguments of `edc_options()`.
#' @noRd
#' @keywords internal
missing_options_helper = function(){
  # full.names=TRUE rather than T: `T` is an ordinary variable that can be
  # reassigned
  options_found = dir("R/", full.names=TRUE) %>% 
    map(readLines) %>% 
    map(~str_subset(.x, "getOption")) %>% 
    keep(~length(.x)>0) %>% 
    unlist() %>% 
    # Each getOption(...) call, then its first argument (a default value after
    # a comma, if any, is left out)
    str_extract_all("getOption\\((.*?)\\)") %>% unlist() %>% 
    str_extract("getOption\\((.*?)(,(.*))?\\)", group=1) %>% 
    unique() %>% 
    # Keep only literal option names, i.e. quoted strings, and unquote them
    str_subset('"') %>% 
    str_remove_all('"')
  
  options_proposed = names(formals(edc_options))
  
  options_found %>% setdiff(options_proposed)
}


#in tests
# Passes only if missing_options_helper() returns an empty character vector
test_that("No missing options", {
  missing_options = missing_options_helper()
  expect_identical(missing_options, character(0))
})

add `save_plotly()` helper

Propose

#' Save a plotly widget as an HTML file
#'
#' @param p A plotly object, or a ggplot, which is first converted with
#'   `plotly::ggplotly()`.
#' @param file Path of the output file. Its directory is created if it does
#'   not exist.
#' @param ... Passed on to `htmlwidgets::saveWidget()`.
#' @return The value of `htmlwidgets::saveWidget()`.
save_plotly = function(p, file, ...){
  if(inherits(p, "ggplot")) p = plotly::ggplotly(p)
  
  out_dir = dirname(file)
  # setwd() errors on a missing directory, so create it first
  if(!dir.exists(out_dir)) dir.create(out_dir, recursive=TRUE)
  
  # The widget is saved from inside the target directory under its bare file
  # name; the previous working directory is restored on exit, even on error.
  wd = setwd(out_dir)
  on.exit(setwd(wd), add=TRUE)
  htmlwidgets::saveWidget(p, file=basename(file), ...)
}

See https://github.com/ramnathv/htmlwidgets/issues/296

`manual_correction()` is not really usable

Current use:

# Case 1, one value: locate the rows by subject and wrong value, then pass
# both the row indices and the wrong value.
l = with(data, which(subjid==111 & dqlq=="2020/03"))
manual_correction(data, dqlq, rows=l, wrong="2020/03", correct="2021/03")

# Case 2, several values: `wrong` first holds the candidate dates used to
# locate the rows, then is overwritten with the values found at rows `l`.
wrong = ymd("2011/11/29", "2021/01/12", "2019/12/11", "2020/12/28")
l=which(data$tc_dt %in% wrong & data$subjid  %in% c(128,196,17,73))
wrong = data$tc_dt[l]
manual_correction(data, tc_dt, rows=l, wrong=wrong, correct=rep(NA, 4))

Problem: once the database is corrected, `wrong` no longer has the right length (possibly 0).

Proposition:

# Proposed call: a single `predicate` expression replaces `rows` and `wrong`
manual_correction(data, tc_dt, predicate=tc_dt %in% wrong & subjid  %in% c(128,196,17,73), correct=rep(NA, 4))

With predicate to be evaluated within data.