Complete Workflow: Combining Genetic and Non-Genetic Evidence

Franco L. Marsico

2026-01-08

Introduction

This vignette demonstrates a complete workflow for missing person identification combining genetic DNA evidence with non-genetic preliminary investigation evidence (sex, age, hair color).

The Case Scenario

A family is searching for their missing relative with the following known characteristics:

A person of interest (POI) has been found with:

We will evaluate this match using both:

  1. Non-genetic preliminary evidence
  2. Genetic DNA comparison with family reference

Setup

library(mispitools)
library(forrel)     # For genetic simulations
library(pedtools)   # For pedigree operations

Part 1: Non-Genetic Evidence

Step 1.1: Define Error Rates

Error rates (epsilon) account for uncertainty in observations:

# Probability of misrecording biological sex
eps_sex <- 0.02

# Probability of age estimate being outside true range
eps_age <- 0.05

# Hair color observation error matrix
eps_color <- error_matrix_hair(ep = 0.05)  # 5% error rate

Step 1.2: Calculate Individual LRs

Sex Evidence:

# MP is female (H=1), POI observed as female
lr_sex_result <- lr_sex(
  LR = TRUE,
  H = 1,          # True hypothesis (MP is female)
  nsims = 1,
  eps = eps_sex,
  erRs = 0.01     # Database recording error
)
#> Warning: Parameter 'nsims' is deprecated. Use 'numsims' instead.

cat("LR for sex evidence:", lr_sex_result$LRs, "\n")
#> LR for sex evidence: 1.96

Age Evidence:

# MP age = 25, tolerance = 5 years (so 20-30 is acceptable)
# POI age = 27 (falls within range)
lr_age_result <- lr_age(
  LR = TRUE,
  H = 1,          # True hypothesis
  nsims = 1,
  epa = eps_age,
  erRa = 0.01,
  MPa = 25,       # MP age
  MPr = 5         # Range tolerance
)
#> Warning: Parameter 'nsims' is deprecated. Use 'numsims' instead.

cat("LR for age evidence:", lr_age_result$LRa, "\n")
#> LR for age evidence: 7.6

Hair Color Evidence:

# MP has color 2 (dark brown), POI observed as color 2
lr_color_result <- lr_hair_color(
  LR = TRUE,
  H = 1,          # True hypothesis
  nsims = 1,
  MPc = 2,        # MP hair color
  epc = eps_color,
  erRc = eps_color
)
#> Warning: Parameter 'nsims' is deprecated. Use 'numsims' instead.

cat("LR for hair color:", lr_color_result$LRc, "\n")
#> LR for hair color: 4.840271

Step 1.3: Combined Non-Genetic LR

# Combine all non-genetic evidence
lr_nongenetic <- lr_sex_result$LRs * lr_age_result$LRa * lr_color_result$LRc
cat("Combined non-genetic LR:", round(lr_nongenetic, 2), "\n")
#> Combined non-genetic LR: 72.1
cat("Log10(LR):", round(log10(lr_nongenetic), 2), "\n")
#> Log10(LR): 1.86

Step 1.4: Visualize the CPTs

# Population CPT (H2)
cpt_h2 <- cpt_population(
  propS = c(0.5, 0.5),
  MPa = 25,
  MPr = 5,
  propC = c(0.15, 0.35, 0.25, 0.15, 0.10)  # Realistic color distribution
)

# MP CPT (H1)
cpt_h1 <- cpt_missing_person(
  MPs = 1,        # Female
  MPc = 2,        # Dark brown
  eps = eps_sex,
  epa = eps_age,
  epc = eps_color
)

# Visualize both CPTs and LR heatmap
plot_cpt(cpt_h2, cpt_h1)

Part 2: Genetic Evidence (Simulation)

For illustration, we’ll show how to simulate genetic LRs using a parent-child relationship pedigree. Note: This code is provided for reference but not executed in this vignette to avoid dependency on specific pedigree structures.

Step 2.1: Create Pedigree

# Create a simple pedigree: parent-child relationship
# The missing person (ID 5) is child of individual 2

# Using linearPed to create grandparent-parent-child
ped <- linearPed(2)  # 5 individuals

# Add genetic markers from Norwegian population
ped <- setMarkers(ped, locusAttributes = NorwegianFrequencies[1:10])

# Simulate a profile for the reference person
set.seed(123)
ped <- profileSim(ped, N = 1, ids = 2)[[1]]

Step 2.2: Simulate Genetic LRs

# Simulate genetic LRs
genetic_sims <- sim_lr_genetic(
  reference = ped,
  missing = 5,
  numsims = 100,
  seed = 456
)

# Convert to dataframe
genetic_df <- lr_to_dataframe(genetic_sims)

# Visualize
plot_lr_distribution(genetic_df)

For this demonstration, we’ll use pre-computed example values:

# Example genetic LR values (pre-computed)
# These represent typical values from parent-child testing
set.seed(42)
genetic_df <- data.frame(
  Related = 10^rnorm(100, mean = 3, sd = 1.5),
  Unrelated = 10^rnorm(100, mean = -0.5, sd = 1)
)

cat("Summary of log10(LR) under H1 (Related):\n")
#> Summary of log10(LR) under H1 (Related):
summary(log10(genetic_df$Related))
#>    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
#>  -1.490   2.075   3.135   3.049   3.992   6.430

cat("\nSummary of log10(LR) under H2 (Unrelated):\n")
#> 
#> Summary of log10(LR) under H2 (Unrelated):
summary(log10(genetic_df$Unrelated))
#>     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
#> -2.52468 -1.09150 -0.56929 -0.58748 -0.03821  2.20189

Step 2.3: Visualize Genetic LR Distributions

# Plot the LR distributions
plot_lr_distribution(genetic_df)

Part 3: Combining All Evidence

Step 3.1: Prior Probability

We need to specify a prior probability that a random POI is the MP. This depends on the size of the candidate population:

# If there are ~10,000 potential candidates
prior_prob <- 1/10000

# Convert to prior odds
prior_odds <- prior_prob / (1 - prior_prob)
cat("Prior probability:", prior_prob, "\n")
#> Prior probability: 1e-04
cat("Prior odds:", prior_odds, "\n")
#> Prior odds: 0.00010001

Step 3.2: Posterior Odds Calculation

The posterior odds combine all evidence:

\[\text{Posterior Odds} = \text{Prior Odds} \times LR_{genetic} \times LR_{nongenetic}\]

# For the simulations under H1 (true match scenario)
posterior_h1 <- prior_odds * genetic_df$Related * lr_nongenetic

# For simulations under H2 (no match scenario)
posterior_h2 <- prior_odds * genetic_df$Unrelated * lr_nongenetic

# Summary
cat("Posterior odds under H1 (median):", round(median(posterior_h1), 4), "\n")
#> Posterior odds under H1 (median): 9.8328
cat("Posterior odds under H2 (median):", round(median(posterior_h2), 6), "\n")
#> Posterior odds under H2 (median): 0.001945

Step 3.3: Decision Threshold Analysis

# Find optimal threshold with weight 10 (FP 10x worse than FN)
threshold_result <- decision_threshold(
  datasim = genetic_df,
  weight = 10
)
#> Decision threshold is: 2694.0341

# Calculate error rates at the optimal threshold
rates <- threshold_rates(
  datasim = genetic_df,
  threshold = threshold_result
)
#> FNR = 0.56 ;  FPR = 0 ;  MCC = 0.5311

# Check rates at different thresholds
cat("\nError rates at different thresholds:\n")
#> 
#> Error rates at different thresholds:
for (t in c(1, 10, 100, 1000)) {
  r <- threshold_rates(genetic_df, threshold = t)
  cat(sprintf("LR > %5d: FPR=%.3f, FNR=%.3f, MCC=%.3f\n",
              t, r$FPR, r$FNR, r$MCC))
}
#> FNR = 0.04 ;  FPR = 0.24 ;  MCC = 0.7348
#> LR >     1: FPR=0.240, FNR=0.040, MCC=0.735
#> FNR = 0.1 ;  FPR = 0.04 ;  MCC = 0.8616
#> LR >    10: FPR=0.040, FNR=0.100, MCC=0.862
#> FNR = 0.24 ;  FPR = 0.01 ;  MCC = 0.7707
#> LR >   100: FPR=0.010, FNR=0.240, MCC=0.771
#> FNR = 0.46 ;  FPR = 0 ;  MCC = 0.6082
#> LR >  1000: FPR=0.000, FNR=0.460, MCC=0.608

# Plot decision curve
plot_decision_curve(
  datasim = genetic_df,
  LRmax = 10000
)

Part 4: Interpretation

Strength of Evidence Categories

The LR quantifies how many times more likely the evidence is under H1 vs H2:

Log10(LR) LR Range Interpretation
< 0 < 1 Supports H2 (not the MP)
0-1 1-10 Weak support for H1
1-2 10-100 Support for H1
2-4 100-10,000 Strong support for H1
> 4 > 10,000 Very strong support for H1

Our Case Results

# Median genetic LR under H1
median_genetic_lr <- median(genetic_df$Related)

# Total LR
total_lr <- median_genetic_lr * lr_nongenetic
log10_total <- log10(total_lr)

cat("Genetic LR (median under H1):", round(median_genetic_lr, 0), "\n")
#> Genetic LR (median under H1): 1364
cat("Non-genetic LR:", round(lr_nongenetic, 2), "\n")
#> Non-genetic LR: 72.1
cat("Total combined LR:", round(total_lr, 0), "\n")
#> Total combined LR: 98318
cat("Log10(Total LR):", round(log10_total, 2), "\n")
#> Log10(Total LR): 4.99

Part 5: Interactive Exploration

For interactive exploration of parameters and their effects, use the Shiny applications:

# Basic CPT and LR visualization
app_mispitools()

# Advanced analysis with ROC curves
app_lr_comparison()

Conclusion

This workflow demonstrated how to:

  1. Calculate LRs for non-genetic evidence (sex, age, hair color)
  2. Simulate genetic LRs from pedigree-based relationships
  3. Combine evidence using the Bayesian framework
  4. Evaluate decision performance and error rates

The key advantage of combining evidence types is increased discrimination power, particularly useful in cases where genetic evidence alone may be inconclusive due to:

Session Information

sessionInfo()
#> R version 4.5.2 (2025-10-31)
#> Platform: x86_64-pc-linux-gnu
#> Running under: Ubuntu 24.04.3 LTS
#> 
#> Matrix products: default
#> BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.12.0 
#> LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.12.0  LAPACK version 3.12.0
#> 
#> locale:
#>  [1] LC_CTYPE=es_ES.UTF-8       LC_NUMERIC=C              
#>  [3] LC_TIME=es_ES.UTF-8        LC_COLLATE=C              
#>  [5] LC_MONETARY=es_ES.UTF-8    LC_MESSAGES=es_ES.UTF-8   
#>  [7] LC_PAPER=es_ES.UTF-8       LC_NAME=C                 
#>  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
#> [11] LC_MEASUREMENT=es_ES.UTF-8 LC_IDENTIFICATION=C       
#> 
#> time zone: America/Argentina/Buenos_Aires
#> tzcode source: system (glibc)
#> 
#> attached base packages:
#> [1] stats     graphics  grDevices utils     datasets  methods   base     
#> 
#> other attached packages:
#> [1] forrel_1.8.1     pedtools_2.9.0   mispitools_1.4.0
#> 
#> loaded via a namespace (and not attached):
#>  [1] tidyr_1.3.2        sandwich_3.1-1     sass_0.4.10        generics_0.1.4    
#>  [5] lpSolve_5.6.23     stringi_1.8.7      verbalisr_0.7.2    lattice_0.22-7    
#>  [9] pROC_1.19.0.1      digest_0.6.39      magrittr_2.0.4     RColorBrewer_1.1-3
#> [13] evaluate_1.0.5     grid_4.5.2         fastmap_1.2.0      plyr_1.8.9        
#> [17] jsonlite_2.0.0     Matrix_1.7-4       Formula_1.2-5      purrr_1.2.0       
#> [21] scales_1.4.0       pbapply_1.7-4      jquerylib_0.1.4    cli_3.6.5         
#> [25] rlang_1.1.6        miscTools_0.6-28   withr_3.0.2        cachem_1.1.0      
#> [29] yaml_2.3.12        otel_0.2.0         tools_4.5.2        parallel_4.5.2    
#> [33] reshape2_1.4.5     kinship2_1.9.6.2   dplyr_1.1.4        ggplot2_4.0.1     
#> [37] maxLik_1.5-2.2     vctrs_0.6.5        R6_2.6.1           zoo_1.8-15        
#> [41] lifecycle_1.0.4    stringr_1.6.0      pedprobr_1.0.1     pkgconfig_2.0.3   
#> [45] pillar_1.11.1      bslib_0.9.0        gtable_0.3.6       Rcpp_1.1.0        
#> [49] glue_1.8.0         pedmut_0.9.0       xfun_0.55          tibble_3.3.0      
#> [53] tidyselect_1.2.1   knitr_1.51         farver_2.1.2       DirichletReg_0.7-2
#> [57] patchwork_1.3.2    htmltools_0.5.9    labeling_0.4.3     rmarkdown_2.30    
#> [61] compiler_4.5.2     S7_0.2.1           ribd_1.7.1         quadprog_1.5-8

References

Marsico FL, Vigeland MD, Egeland T, Herrera Pinero F (2021). “Making decisions in missing person identification cases with low statistical power.” Forensic Science International: Genetics, 52, 102519.

Marsico FL, et al. (2023). “Likelihood ratios for non-genetic evidence in missing person cases.” Forensic Science International: Genetics, 66, 102891.