clinTrialData is a community-grown
library of clinical trial example datasets for R. The package
ships with a core set of studies and is designed to expand over time —
anyone can contribute a new data source, and users can download any
available study on demand without waiting for a new package release.
Data is stored in Parquet format and accessed through the
connector package, giving a consistent API regardless of
which study you are working with.
Key features:
- download_study() to fetch any available study and cache it locally
- connect_clinical_data() to connect to any available data source
- list_data_sources() finds all studies on your machine; list_available_studies() shows everything available to download

The package bundles the CDISC Pilot 01 study, so you can connect immediately:
# Connect to CDISC Pilot data
db <- connect_clinical_data("cdisc_pilot")
#> ℹ Replace some metadata informations...
#> ────────────────────────────────────────────────────────────────────────────────
#> Connection to:
#> → adam
#> • connector_fs
#> • /Users/lovemore.gakavagmail.com/Library/Caches/org.R-project.R/R/clinTrialData/cdisc_pilot/adam
#> ────────────────────────────────────────────────────────────────────────────────
#> Connection to:
#> → sdtm
#> • connector_fs
#> • /Users/lovemore.gakavagmail.com/Library/Caches/org.R-project.R/R/clinTrialData/cdisc_pilot/sdtm
# List available datasets in the ADaM domain
db$adam$list_content_cnt()
#> [1] "adae.parquet" "adlb.parquet" "adlbc.parquet" "adlbh.parquet"
#> [5] "adlbhy.parquet" "adqsadas.parquet" "adqscibc.parquet" "adqsnpix.parquet"
#> [9] "adsl.parquet" "adtte.parquet" "advs.parquet"
# Read the subject-level dataset
adsl <- db$adam$read_cnt("adsl")
#> → Found one file: '/Users/lovemore.gakavagmail.com/Library/Caches/org.R-project.R/R/clinTrialData/cdisc_pilot/adam/adsl.parquet'
head(adsl[, c("USUBJID", "TRT01A", "AGE", "SEX", "RACE")])
#> # A tibble: 6 × 5
#> USUBJID TRT01A AGE SEX RACE
#> <chr> <chr> <dbl> <chr> <chr>
#> 1 01-701-1015 Placebo 63 F WHITE
#> 2 01-701-1023 Placebo 64 M WHITE
#> 3 01-701-1028 Xanomeline High Dose 71 M WHITE
#> 4 01-701-1033 Xanomeline Low Dose 74 M WHITE
#> 5 01-701-1034 Xanomeline High Dose 77 F WHITE
#> 6 01-701-1047 Placebo 85 F WHITE

Studies beyond the bundled data can be downloaded from GitHub Releases:
# Dimensions
dim(adsl)
#> [1] 254 48
# Quick structure overview
str(adsl, list.len = 10)
#> tibble [254 × 48] (S3: tbl_df/tbl/data.frame)
#> $ STUDYID : chr [1:254] "CDISCPILOT01" "CDISCPILOT01" "CDISCPILOT01" "CDISCPILOT01" ...
#> ..- attr(*, "label")= chr "Study Identifier"
#> $ USUBJID : chr [1:254] "01-701-1015" "01-701-1023" "01-701-1028" "01-701-1033" ...
#> ..- attr(*, "label")= chr "Unique Subject Identifier"
#> $ SUBJID : chr [1:254] "1015" "1023" "1028" "1033" ...
#> ..- attr(*, "label")= chr "Subject Identifier for the Study"
#> $ SITEID : chr [1:254] "701" "701" "701" "701" ...
#> ..- attr(*, "label")= chr "Study Site Identifier"
#> $ SITEGR1 : chr [1:254] "701" "701" "701" "701" ...
#> ..- attr(*, "label")= chr "Pooled Site Group 1"
#> $ ARM : chr [1:254] "Placebo" "Placebo" "Xanomeline High Dose" "Xanomeline Low Dose" ...
#> ..- attr(*, "label")= chr "Description of Planned Arm"
#> $ TRT01P : chr [1:254] "Placebo" "Placebo" "Xanomeline High Dose" "Xanomeline Low Dose" ...
#> ..- attr(*, "label")= chr "Planned Treatment for Period 01"
#> $ TRT01PN : num [1:254] 0 0 81 54 81 0 54 54 54 0 ...
#> ..- attr(*, "label")= chr "Planned Treatment for Period 01 (N)"
#> $ TRT01A : chr [1:254] "Placebo" "Placebo" "Xanomeline High Dose" "Xanomeline Low Dose" ...
#> ..- attr(*, "label")= chr "Actual Treatment for Period 01"
#> $ TRT01AN : num [1:254] 0 0 81 54 81 0 54 54 54 0 ...
#> ..- attr(*, "label")= chr "Actual Treatment for Period 01 (N)"
#> [list output truncated]

# Read adverse events data
adae <- db$adam$read_cnt("adae")
#> → Found one file: '/Users/lovemore.gakavagmail.com/Library/Caches/org.R-project.R/R/clinTrialData/cdisc_pilot/adam/adae.parquet'
head(adae[, c("USUBJID", "AEDECOD", "AESEV", "AESER")])
#> # A tibble: 6 × 4
#> USUBJID AEDECOD AESEV AESER
#> <chr> <chr> <chr> <chr>
#> 1 01-701-1015 APPLICATION SITE ERYTHEMA MILD N
#> 2 01-701-1015 APPLICATION SITE PRURITUS MILD N
#> 3 01-701-1015 DIARRHOEA MILD N
#> 4 01-701-1023 ERYTHEMA MILD N
#> 5 01-701-1023 ERYTHEMA MODERATE N
#> 6 01-701-1023 ATRIOVENTRICULAR BLOCK SECOND DEGREE MILD N

# Read demographics
dm <- db$sdtm$read_cnt("dm")
#> → Found one file: '/Users/lovemore.gakavagmail.com/Library/Caches/org.R-project.R/R/clinTrialData/cdisc_pilot/sdtm/dm.parquet'
head(dm[, c("USUBJID", "ARM", "AGE", "SEX", "RACE")])
#> # A tibble: 6 × 5
#> USUBJID ARM AGE SEX RACE
#> <chr> <chr> <dbl> <chr> <chr>
#> 1 01-701-1015 Placebo 63 F WHITE
#> 2 01-701-1023 Placebo 64 M WHITE
#> 3 01-701-1028 Xanomeline High Dose 71 M WHITE
#> 4 01-701-1033 Xanomeline Low Dose 74 M WHITE
#> 5 01-701-1034 Xanomeline High Dose 77 F WHITE
#> 6 01-701-1047 Placebo 85 F WHITE

library(dplyr)
#>
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#>
#> filter, lag
#> The following objects are masked from 'package:base':
#>
#> intersect, setdiff, setequal, union
# Basic demographic summary by treatment
adsl |>
group_by(TRT01A) |>
summarise(
n = n(),
mean_age = mean(AGE, na.rm = TRUE),
female_pct = mean(SEX == "F", na.rm = TRUE) * 100,
.groups = "drop"
)
#> # A tibble: 3 × 4
#> TRT01A n mean_age female_pct
#> <chr> <int> <dbl> <dbl>
#> 1 Placebo 86 75.2 61.6
#> 2 Xanomeline High Dose 84 74.4 47.6
#> 3 Xanomeline Low Dose 84 75.7 59.5

Anyone can add a new study to the library. Datasets live on GitHub Releases, not inside the package — so no pull request or CRAN submission is needed to add data.
Organise your Parquet files by domain:
your_new_study/
├── adam/
│ ├── adsl.parquet
│ └── adae.parquet
└── sdtm/
├── dm.parquet
└── ae.parquet
Open an issue to request a release slot, then use the helper script:
source("data-raw/upload_to_release.R")
# Upload the data zip
upload_study_to_release("your_new_study", tag = "v1.1.0")
# Generate and upload metadata (enables dataset_info() for your study)
generate_and_upload_metadata(
source = "your_new_study",
description = "Brief description of your study",
version = "v1.1.0",
license = "Your license here",
source_url = "https://link-to-original-data",
tag = "v1.1.0"
)