Modelling joint distribution of three hydrologic variables

Floof peak, volume and duration on the Parná river

Author

Tomáš Bacigál

Published

October 29, 2023

1 Initial

1.1 Libraries and settings

library(ggplot2)   # grammar of graphics package
library(patchwork)  # combining ggplots
theme_set(theme_bw())  # ggplot white background theme

1.2 Data import

dat <- read.table("data/HO_summer.csv", 
                  header = TRUE, sep = ";", dec = ".", check.names = FALSE) |>
  dplyr::transmute(index = Poradie,
                   Qmax = `Kulminacny prietok [m3/s]`,
                   Volume = `Objem vlny [mil. m3]`,
                   Duration = `Trvanie povodne [h]`/24)
str(dat)
'data.frame':   33 obs. of  4 variables:
 $ index   : int  1 2 3 4 5 6 7 8 9 10 ...
 $ Qmax    : num  1.393 0.559 0.67 1.111 0.877 ...
 $ Volume  : num  0.1787 0.0456 0.1145 0.2481 0.0526 ...
 $ Duration: num  4.38 3.04 3.92 5.33 3.46 ...
dat_long <- dat |> 
  tidyr::pivot_longer(cols = - index, names_to = "variable", values_to = "value") |>  
  dplyr::mutate(variable = forcats::as_factor(variable))
str(dat_long)
tibble [99 × 3] (S3: tbl_df/tbl/data.frame)
 $ index   : int [1:99] 1 1 1 2 2 2 3 3 3 4 ...
 $ variable: Factor w/ 3 levels "Qmax","Volume",..: 1 2 3 1 2 3 1 2 3 1 ...
 $ value   : num [1:99] 1.393 0.1787 4.375 0.559 0.0456 ...

2 Marginal distribution

2.1 Raw observations

dat_long |> 
  dplyr::mutate(variable = dplyr::recode(variable, 
                                         Qmax = "Qmax [m3/s]",
                                         Volume = "Volume [mil m3]",
                                         Duration = "Duration [day]")
  ) |> 
  ggplot() + aes(x = index, y = value) + 
  geom_point() + 
  facet_grid(rows = vars(variable), scales = "free_y") +
  ylab("")

2.2 Estimation

2.2.1 Helper functions and objects

One dimensional grid for visualization

# grid in wide format would be confusing 
# because of placing unrelated values in one row
# grid1 <- dat |> 
#   dplyr::reframe(across(-index, ~ modelr::seq_range(.x, n = 20)))

grid1_long <- dat_long |> 
  dplyr::reframe(value = modelr::seq_range(value, n = 50), 
                 .by = variable)
str(grid1_long)
tibble [150 × 2] (S3: tbl_df/tbl/data.frame)
 $ variable: Factor w/ 3 levels "Qmax","Volume",..: 1 1 1 1 1 1 1 1 1 1 ...
 $ value   : num [1:150] 0.089 0.778 1.467 2.157 2.846 ...

Fit univariate distribution by maximum-likelihood (ML) method. Starting values of parameters are refined by least-squares (LS) method.

# arguments:
# x - data to fit
# pdf, cdf - functions with data as first argument
# start - valid parameters to start with
fit_distr <- function(x, pdf, cdf, start, ...) {
  Ecdf <- rank(x) / (length(x)+1)
  fun <- function(pars) {
    sum((Ecdf - cdf(x, !!!pars))^2) |> rlang::inject()  # or do.call
  }
  fit <- optim(par = start, fn = fun, ...)
  fun <- function(pars) {
    - sum(log(pdf(x, !!!pars))) |> rlang::inject()
  }
  out <-  try(optim(par = (9*fit$par + 1*start)/10, fn = fun, ...), silent = TRUE)
  # the convex combination lowers the possibility of failure on boundary
  if(inherits(out, "try-error")) { 
    out <- fit |> c(type = "LS")
    } else {
      out <- out |> c(type = "ML")
    }
  out
}

2.2.2 Empirical

Empirical density (histogram)

dat_long |>
  ggplot() + aes(x = value) + 
  geom_histogram(bins = 10, color = "white") + 
  facet_grid(cols = vars(variable), scales = "free_x") +
  xlab("")

Empirical CDF (standardized ranks, plotting position)

dat_Memp_long <- dat_long |> 
  dplyr::mutate(cdf = rank(value) / (dplyr::n() + 1), .by = variable)

dat_Memp_long |> 
  ggplot() + aes(x = value, y = cdf) + 
  geom_step() + 
  facet_grid(cols = vars(variable), scales = "free_x") +
  xlab("")

Pseudo-observations (CDF transformed data)

dat_Memp <- dat_Memp_long |> 
  tidyr::pivot_wider(id_cols = c(index), names_from = variable, values_from = cdf)

2.2.3 Kernel smoothing

It is necessary to install probhat package and its dependencies (1.) kubik, (2.) barsurf.

# nested list (variable / function)
funs_Mks <- dat |> 
  dplyr::select(-index) |> 
  purrr::map(.f = \(x) purrr::map(
    .x = c(cdf = "cdfuv.cks", pdf = "pdfuv.cks", qua = "qfuv.cks"), 
    .f = \(y) do.call(y, list(x = x, a = 0), envir = asNamespace("probhat"))
    ))

dat_Mks_long <- purrr::map(
  .x = unique(dat_long$variable),
  .f = \(x) {
    dat_long |> 
      dplyr::filter(variable == x) |> 
      dplyr::mutate(pdf = funs_Mks[[x]]$pdf(value),
                    cdf = funs_Mks[[x]]$cdf(value))
  }) |> 
  dplyr::bind_rows()

grid1_Mks_long <- purrr::map(
  .x = unique(dat_long$variable),
  .f = \(x) {
    grid1_long |> 
      dplyr::filter(variable == x) |> 
      dplyr::mutate(pdf = funs_Mks[[x]]$pdf(value),
                    cdf = funs_Mks[[x]]$cdf(value))
  }) |> 
  dplyr::bind_rows()
str(grid1_Mks_long)
tibble [150 × 4] (S3: tbl_df/tbl/data.frame)
 $ variable: Factor w/ 3 levels "Qmax","Volume",..: 1 1 1 1 1 1 1 1 1 1 ...
 $ value   : num [1:150] 0.089 0.778 1.467 2.157 2.846 ...
 $ pdf     : num [1:150] 0.198 0.251 0.252 0.193 0.133 ...
 $ cdf     : num [1:150] 0.0216 0.186 0.3461 0.5069 0.6207 ...
ggplot() + aes(x = value) +
  geom_histogram(aes(y = after_stat(density)), data = dat_long, 
                 bins = 10, color = "white", fill = "grey") +
  #geom_density(data = dat_long, bounds = c(0, Inf)) +
  geom_line(aes(y = pdf), data = grid1_Mks_long) +
  facet_wrap(vars(variable), scales = "free")

ggplot() + aes(x = value, y = cdf) +
  geom_step(data = dat_Memp_long, color = "grey") +
  geom_line(data = grid1_Mks_long) +
  facet_wrap(vars(variable), scales = "free_x") +
  xlab("")

2.2.4 Parametric

2.2.4.1 L-moments

Parametric families of univariate distributions from package lmomco.

family_name_lmomco <- c("exp", "gam", "gev", "gld", "glo", "gno", "gov", "gpa", "gum", "kap", "lap", "ln3", "nor", "pe3", "ray", "revgum", "rice", "wak", "wei")

# nested list (variable / family / function)
funs_Mpar_lmo <- dat |> 
  dplyr::select(-index) |> 
  lapply(FUN = function(data) {
    lmoms <- lmomco::lmoms(data)
    lapply(family_name_lmomco |> (\(x) setNames(x,x))(),  # to pass names
           FUN = function(y) {
      exceptions <- c("ln3", "kap", "gpa")
      # tryCatch(  # helps to identify family for which an error occurred  # - start
      if(y %in% exceptions) {
        if(y == "ln3") pars <- lmomco::lmom2par(lmoms, type = y, zeta = 0)
        if(y == "kap") pars <- lmomco::lmom2par(lmoms, type = y, snap.tau4=TRUE)
        if(y == "gpa") pars <- lmomco::lmom2par(lmoms, type = y, xi = 0)
      } else {
        pars <- lmomco::lmom2par(lmoms, type = y)
      }
      #, error = function(e) { cat("with family", y,":\n"); print(e) })  # - end
      list(
        pdf = function(x) lmomco::par2pdf(x, para = pars),
        cdf = function(q) lmomco::par2cdf(q, para = pars),
        qua = function(p) lmomco::par2qua(p, para = pars)
      )
    })
  })

2.2.4.2 Maximum likelihood

User-defined parametric family: the Johnson bounded distribution (type SB) introduced by Johnson (1949):

# conditions: xi < x < xi + lambda 
 
# underlying conversion function between Johnson and N(0,1) distribution
convert_Joh_norm <- function(x, gamma, delta, xi, lambda, to = c("norm", "Joh")) {
  g <- function(x, a, b) (x-a)/b  # standardization
  f <- function(u) u/(1-u)  # SB specific
  fi <- function(u) u/(1+u)  # inverse to f
  out <- switch(to[1],
                norm = gamma + delta*log(f(g(x,xi,lambda))),
                Joh = xi + lambda*fi(exp(g(x,gamma,delta)))
  )
  out
}
# probability density function, PDF
dJohSB <- function(x, gamma, delta, xi, lambda) {
  suppressWarnings(
    z <- convert_Joh_norm(x, gamma, delta, xi, lambda)
    )  # suppress 'NaNs produced' warning
  # pdfJ(x) = pdfN01(z) * |dz(x)/dx|
  out <- dnorm(z) * delta * lambda / (x-xi) / (xi + lambda - x) 
  out[x <= xi | x >= xi + lambda] <- 0
  out
}
# cumulative distribution function, CDF
pJohSB <- function(q, gamma, delta, xi, lambda) {
  suppressWarnings(
    z <- convert_Joh_norm(q, gamma, delta, xi, lambda)
    )  # suppress 'NaNs produced' warning
  out <- pnorm(z)
  out[q <= xi] <- 0
  out[q >= xi + lambda] <- 1
  out
}
# quantile function
qJohSB <- function(p, gamma, delta, xi, lambda) {
  out <- p |> 
    qnorm() |> 
    convert_Joh_norm(gamma, delta, xi, lambda, to = "Joh")
  out[is.nan(out)] <- xi + lambda
  out
}
# random generator
rJohSB <- function(n, gamma, delta, xi, lambda) {
  rnorm(n) |> 
    convert_Joh_norm(gamma, delta, xi, lambda, to = "Joh")
}
# Function to calculate ML estimates with constraints: lim[1] <= xi < min(x) & max(x) < xi + lambda <= lim[2]
# Standard errors are optionally returned as "se" attribute.
# TO DELETE:
# eJohSB <- function(x, eps = NULL, se = FALSE) {
#   ran <- range(x)
#   if(is.null(eps)) eps <- 0.001 * diff(ran)  # stabilize MLE
#   pdf <- function(x, gamma, delta, xi, xila) {
#       dJohSB(x, gamma, delta, xi, xila - xi)
#   }
#   cdf <- function(x, gamma, delta, xi, xila) {
#       pJohSB(x, gamma, delta, xi, xila - xi)
#   }
#   fit <- fit_distr(
#     x, pdf, cdf, 
#     method = "L-BFGS-B", 
#     start = c(gamma = 0.2, delta = 0.5, xi = ran[1]*0.9, xila = ran[2]*1.1),
#     lower = c(gamma = -Inf, delta = 0, xi = 0, xila = ran[2] + eps),  
#     upper = c(gamma = Inf, delta = Inf, xi = ran[1] - eps, xila = Inf),
#     hessian = TRUE,
#     control = list(maxit = 500)
#     )
#   lambda <- unname(fit$par["xila"] - fit$par["xi"])
#   out <- c(fit$par[-4], lambda = lambda)
#   if(isTRUE(se)) {
#     vcov <- solve(fit$hessian)
#     var_lambda <- unname(vcov["xi","xi"] + vcov["xila","xila"] + 2*vcov["xi","xila"])
#     attr(out, "se") <- sqrt(c(diag(vcov)[-4], lambda = var_lambda))
#   }
#   out
# } 

# in:
#  x - numeric vector to be fitted
#  lim - outer limits for values of the fitted random variable
#  eps - a positive real number close to zero, tuning parameter 
#  se - logical, whether standard errors should be computed
# out: named vector of estimated parameters with optional attribute 'se'
eJohSB <- function(x, lim = c(0,Inf), eps = NULL, se = FALSE) {
  ran <- range(x)
  if(lim[1]>=ran[1] | lim[2]<=ran[2]) stop("Argument 'lim' is within the range of 'x'!")
  if(is.null(eps)) eps <- 0.001 * diff(ran)  # stabilize MLE
  pdf <- function(x, gamma, delta, xi, xila) {
    dJohSB(x, gamma, delta, xi, xila - xi)
  }
  cdf <- function(x, gamma, delta, xi, xila) {
    pJohSB(x, gamma, delta, xi, xila - xi)
  }
  start_xi <- max((lim[1]+ran[1])/2, ran[1]*0.9)  # in case lim contains infinity
  start_xila <- min((ran[2]+lim[2])/2, ran[2]*1.1)
  up_xi <- max((lim[1]+3*ran[1])/4, ran[1]-eps) # in case eps is too big
  lo_xila <- min((3*ran[2]+lim[2])/4, ran[2]+eps)
  fit <- fit_distr(
    x, pdf, cdf, 
    method = "L-BFGS-B", 
    start = c(gamma = 0.2, delta = 0.5, 
              xi = start_xi, xila = start_xila),
    lower = c(gamma = -Inf, delta = 0, xi = lim[1], xila = lo_xila),  
    upper = c(gamma = Inf, delta = Inf, xi = up_xi, xila = lim[2]),
    hessian = se,
    control = list(maxit = 500)
  )
  lambda <- unname(fit$par["xila"] - fit$par["xi"])
  out <- c(fit$par[-4], lambda = lambda) |> 
    `attr<-`("method", fit$type)
  if(isTRUE(se)) {
    vcov <- solve(fit$hessian)
    var_lambda <- unname(vcov["xi","xi"] + vcov["xila","xila"] + 2*vcov["xi","xila"])
    attr(out, "se") <- sqrt(c(diag(vcov)[-4], lambda = var_lambda))
  }
  out
} 

# example use:
# pJohSB(dat$Duration, gamma = 0.1, delta = 0.5, xi = 2, lambda = 6)
# or with rlang injection:
# tmp <- list(gamma = 0.1, delta = 0.5, xi = 2, lambda = 6) 
# pJohSB(dat$Duration, !!!tmp) |> rlang::inject()
# dJohSB(dat$Duration, !!!tmp) |> rlang::inject()
# qJohSB(c(0,0.5,1), !!!tmp) |> rlang::inject()
# rJohSB(10, !!!tmp) |> rlang::inject()
# eJohSB(dat$Volume)

Estimation

funs_Mpar_mle <- dat |> 
  dplyr::select(-index) |> 
  lapply(FUN = function(data) {
    pars <- as.list(eJohSB(data))
    list(joh = list(
      pdf = function(x) dJohSB(x, !!!pars) |> rlang::inject(),
      cdf = function(q) pJohSB(q, !!!pars) |> rlang::inject(),
      qua = function(p) qJohSB(p, !!!pars) |> rlang::inject()
    ))
  })

2.2.4.3 Combine

Choose what to keep!

funs_Mpar <- mapply(FUN = c, 
                   funs_Mpar_lmo[], 
                   funs_Mpar_mle[], 
                   SIMPLIFY = FALSE)

Calculation in grid

grid1_Mpar_long <- purrr::imap(
  .x = funs_Mpar,
  .f = function(x, i) {  # variable layer
    purrr::imap(
      .x = x,
      .f = function(y, j) {  # family layer
        df <- dplyr::filter(grid1_long, variable == i)
        data.frame(variable = i, 
                   family = j,
                   value = df$value,
                   pdf = funs_Mpar[[i]][[j]]$pdf(df$value),
                   cdf = funs_Mpar[[i]][[j]]$cdf(df$value)
                   )
      }
    ) |> dplyr::bind_rows()
  }
) |> dplyr::bind_rows() |> 
  dplyr::mutate(variable = forcats::as_factor(variable))

Calculation on observed data

dat_Mpar_long <- purrr::imap(
  .x = funs_Mpar,
  .f = function(x, i) {  # variable layer
    purrr::imap(
      .x = x,
      .f = function(y, j) {  # family layer
        df <- dat_long |> 
          dplyr::filter(variable == i) |> 
          dplyr::mutate(
            family = j,
            pdf = funs_Mpar[[i]][[j]]$pdf(value),
            cdf = funs_Mpar[[i]][[j]]$cdf(value)
          )
        #data.frame(index = 
        #           variable = i, 
        #           family = j,
        #           value = df$value,
        #           pdf = funs_Mpar[[i]][[j]]$pdf(df$value),
        #           cdf = funs_Mpar[[i]][[j]]$cdf(df$value)
        #           )
      }
    ) |> dplyr::bind_rows()
  }
) |> dplyr::bind_rows() |> 
  dplyr::mutate(variable = forcats::as_factor(variable))

2.2.5 Unite model functions

funs_Mall <- mapply(FUN = c, 
                   list(ks = funs_Mks) |> purrr::transpose(), 
                   funs_Mpar, 
                   SIMPLIFY = FALSE)

2.3 Decision

2.3.1 GoF comparison

2.3.1.1 Visual

p <- ggplot() + aes(x = value, y = cdf) +
  geom_step(data = dat_Memp_long) +
  geom_line(data = grid1_Mks_long) +
  geom_line(data = grid1_Mpar_long, aes(color = family)) +
  facet_wrap(vars(variable), ncol = 1, scales = "free_x") +
  xlab("")
plotly::ggplotly(p, height = 800)

A note to the above interactive plot: To deselect all but one colored curves, double-click the legend entry of your choice. To show all, reload the page.

2.3.1.2 Numeric

gof_marg <- dat_Mks_long |> 
  dplyr::mutate(family = "ks") |> 
  rbind(dat_Mpar_long) |> 
  dplyr::rename(model = family) |> 
  dplyr::select(-pdf) |> 
  dplyr::full_join(dat_Memp_long, 
                   by = dplyr::join_by(index, variable, value),
                   suffix = c(".par", ".emp")
  ) |> 
  dplyr::summarise(MSE = mean((cdf.emp - cdf.par)^2), .by = c(variable, model)) |> 
  dplyr::arrange(variable, MSE) |> 
  dplyr::mutate(model = forcats::as_factor(model))

gof_marg |> dplyr::slice_head(n = 3, by = variable)
# A tibble: 9 × 3
  variable model      MSE
  <fct>    <fct>    <dbl>
1 Qmax     gno   0.000894
2 Qmax     kap   0.00111 
3 Qmax     wak   0.00111 
4 Volume   joh   0.000688
5 Volume   gld   0.000744
6 Volume   ln3   0.000784
7 Duration wak   0.000626
8 Duration ks    0.00102 
9 Duration gld   0.00134 
gof_marg |> 
  ggplot() + aes(x = model, y = MSE, color = variable) + geom_point() +
  #facet_wrap(facets = vars(variable), ncol = 1) +
  theme(legend.position = c(0.2,0.8))

2.3.2 Selection

Choose what marginals model to continue with!

marg_id <- c(Qmax = "gev", Volume = "wei", Duration = "joh")

Model functions in one object

marg_fu <- marg_id |> 
  purrr::imap(.f = function(x, i) {
    funs_Mall[[i]][[x]]
  })

Last check of GoF

marg_fu |> 
  purrr::imap(.f = function(x, i) {
    grid1_long |> 
      dplyr::filter(variable == i) |> 
      dplyr::mutate(cdf = x$cdf(value))
  }) |> 
  purrr::list_rbind() |>  # more-or-less the same as dplyr::bind_rows
  ggplot() + aes(x = value, y = cdf) + 
  geom_step(data = dplyr::mutate(dat_Memp_long, data = "empirical")) +
  geom_line(color = 4) + 
  facet_wrap(vars(variable), nrow = 1, scales = "free_x") +
  xlab("")

Kolmogorov-Smirnov, Cramer-von Mises and Anderson-Darling GoF test

marg_fu |> 
  purrr::imap(.f = function(x, i) {
    dat_long |> 
      dplyr::filter(variable == i) |> 
      dplyr::summarize(
        KS = ks.test(value, y = x$cdf)$p.value,
        CvM = goftest::cvm.test(value, null = x$cdf)$p.value, 
        AD = goftest::ad.test(value, null = x$cdf)$p.value
      ) |> 
      round(4)
  }) |> 
  purrr::list_rbind(names_to = "variable")
Warning: There was 1 warning in `dplyr::summarize()`.
ℹ In argument: `KS = ks.test(value, y = x$cdf)$p.value`.
Caused by warning in `ks.test.default()`:
! ties should not be present for the Kolmogorov-Smirnov test
# A tibble: 3 × 4
  variable    KS   CvM    AD
  <chr>    <dbl> <dbl> <dbl>
1 Qmax     0.924 0.845 0.853
2 Volume   0.950 0.926 0    
3 Duration 0.36  0.352 0.310

Strange behaviour of Anderson-Darling test. Subject to check.

3 Dependence

vars_comb <- levels(dat_long$variable) |> 
  combn(m = 2) |>  
  t() |> data.frame() 

3.1 Single measure

Correlation coefficients (cor) and test of significance p-value (pval):

  • Pearson’s (pea),
  • Spearman’s (spe).
cor_test_comb <- vars_comb |> 
  dplyr::summarize(
    pea = cor.test(dat[[X1]], dat[[X2]])[c("estimate","p.value")] |> 
      unlist() |> setNames(c("cor","pval")) |> as.list() |> data.frame(),
    spe = cor.test(dat[[X1]], dat[[X2]], method = "spearman", exact = FALSE) |> 
      (`[`)(c("estimate","p.value")) |> 
      unlist() |> setNames(c("cor","pval")) |> as.list() |> data.frame(),
    .by = c(X1,X2)
  ) |> 
  tidyr::unnest(cols = c(pea, spe), names_sep = ".") |> 
  dplyr::mutate(across(-c(X1,X2), ~round(.x, 4)))

cor_test_comb |> as.data.frame()
      X1       X2 pea.cor pea.pval spe.cor spe.pval
1   Qmax   Volume  0.7616   0.0000  0.8643   0.0000
2   Qmax Duration  0.1309   0.4679  0.4108   0.0175
3 Volume Duration  0.5545   0.0008  0.7234   0.0000

Partial correlation matrix

dat[,-1] |> 
  cov() |> 
  solve() |> 
  cov2cor() |> 
  (`*`)(2*diag(1,ncol(dat[,-1]),ncol(dat[,-1]))-1) |> 
  round(2)
          Qmax Volume Duration
Qmax      1.00   0.84    -0.54
Volume    0.84   1.00     0.71
Duration -0.54   0.71     1.00
dat[-1] |> 
  cov() |>
  qgraph::qgraph(edge.labels=TRUE, graph="pcor", 
                 edge.label.cex=2, edge.label.margin = 0.05, label.cex=1.3)

3.2 Visual exploration

Observed data and Pearson correlation coefficient (with p-value of its significance test)

cor_test_comb |> 
  dplyr::select(X1, X2, pea.cor, pea.pval) |>
  purrr::pwalk(.f = function(X1, X2, pea.cor, pea.pval) {
    p <- ggplot(data = dat) + 
      aes(x = .data[[X1]], y = .data[[X2]]) + 
      geom_point() +
      geom_text(data = data.frame(), 
                aes(label = paste0(round(pea.cor, 2), " (", round(pea.pval, 3), ")")), 
                x=Inf, y=-Inf, hjust = 1.5, vjust = -2, size=3)
    print(p)
  })

Observations transformed by empirical CDF and Spearman correlation coefficient (with p-value of its significance test) displayed in the corner

cor_test_comb |> 
  dplyr::select(X1, X2, spe.cor, spe.pval) |>
  purrr::pwalk(.f = function(X1, X2, spe.cor, spe.pval) {
    p <- ggplot(data = dat_Memp) + 
      aes(x = .data[[X1]], y = .data[[X2]]) + 
      geom_point() +
      geom_text(data = data.frame(), 
                aes(label = paste0(round(spe.cor, 2), " (", round(spe.pval, 3), ")")), 
                x = Inf, y = -Inf, hjust = 1.2, vjust = -1.5, size = 3)
    print(p)
  })  

Pseudo-observations with standard normal marginals and density level curves (to be compared with bivariate normal density which is elliptically contoured)

dat_Memp_qnorm <- dat_Memp |> 
  dplyr::mutate(across(-index, qnorm))
vars_comb |> 
  purrr::pwalk(.f = function(X1, X2) {
    p <- ggplot(data = dat_Memp_qnorm) + 
      aes(x = .data[[X1]], y = .data[[X2]]) + 
      geom_point() + 
      geom_density2d(adjust = c(1.3,1.3))
    print(p)
  })  

3.3 Copula

# dataset in matrix form and without indices
dat_mat <- dat[-1] |> as.matrix()
dat_Memp_mat <- dat_Memp[,-1] |> as.matrix()

3.3.1 Nonparametric

Empirical copula with no or specified smoothing. See Hofert et al. (2018), chapter 4.2 for details on smoothing methods (and their meaning for small samples) and chapter 6.1 for discussion about dealing with ties (effect on estimation).

funs_Cnp <- c(emp = "none", chb = "checkerboard", beta = "beta") |> 
  lapply(FUN = function(type) { 
    obj <- copula::empCopula(dat_Memp_mat, ties.method = "average", smoothing = type)
    list(  # return
      obj = obj,
      pdf = function(x) copula::dCopula(x, copula = obj),
      cdf = function(x) copula::pCopula(x, copula = obj),
      ran = function(n) copula::rCopula(n, copula = obj) |>
        (`colnames<-`)(colnames(dat_Memp_mat))  
        # because beta fails to pass colnames (copula version 1.1-2)
    )
  }
)

# examples:
# funs_Cnp$emp$cdf(rep(0.5,3))
# funs_Cnp$chb$cdf(rep(0.5,3))
# funs_Cnp$beta$cdf(rep(0.5,3))
# results: 0.3636364, 0.3333333, 0.2990709

3.3.2 Elliptical

(Correct name: copulas of elliptically contoured distributions.)

Estimation

funs_Cpar_ell <- c(nor = "normal", tco = "t") |> 
  lapply(FUN = function(fam) {
    obj <- copula::ellipCopula(   # initial constructor
      fam, 
      # param = cor(dat_mat) |> (\(x) x[upper.tri(x)])(),  # not necessary
      dim = ncol(dat_mat),
      dispstr = "un"
      ) |> 
      copula::fitCopula(data = dat_Memp_mat) |>  # estimation 
      (`@`)("copula")  # copula object extraction
    if(fam == "t") {  # round df, otherwise pCopula fails
      df_ind <- length(obj@parameters)  # index of df (last parameter)
      obj@parameters[df_ind] <- obj@parameters[df_ind] |> round() |> max(1)
    }
    list(  # return
      obj = obj,
      pdf = function(x) copula::dCopula(x, copula = obj),
      cdf = function(u) copula::pCopula(u, copula = obj),
      ran = function(n) copula::rCopula(n, copula = obj) |>
        (`colnames<-`)(colnames(dat_mat))
    )
  })
Warning in var.mpl(copula, u): the covariance matrix of the parameter estimates
is computed as if 'df.fixed = TRUE' with df = 7.11485542432891
# examples:
# funs_Cpar_ell$nor$pdf(c(0.5,0.5,0.5))  # result: 4.268227
# funs_Cpar_ell$tco$cdf(c(0.5,0.5,0.5))  # result: 0.3175686

3.3.3 Vine

Specify admissible families and particular vine structure via R-vine matrix!

families_vine <- NA  # or 0:6 for standard 1-parameter bicopulas
structures_vine <- list(
  vi1 = rbind(  # 1st variable as root
  c(2, 0, 0),
  c(3, 3, 0),
  c(1, 1, 1)
),
  vi2 = rbind(  # 2nd variable as root
  c(1, 0, 0),
  c(3, 3, 0),
  c(2, 2, 2)
),
  vi3 = rbind(  # 3rd variable as root
  c(2, 0, 0),
  c(1, 1, 0),
  c(3, 3, 3)
)
)

Estimation

# first, define vine objects
funs_Cpar_vine <- structures_vine |> 
  lapply(function(rvm) {
    list(obj = VineCopula::RVineCopSelect(
      data = dat_Memp_mat,
      familyset = families_vine, 
      Matrix = rvm, 
      rotations = TRUE
    )) 
  })

# funs_Cpar_vine <- list()   # uncomment for automatic structure selection only
funs_Cpar_vine$vin <- list(obj = VineCopula::RVineStructureSelect(   # auto
  dat_Memp_mat, 
  familyset = families_vine, 
  indeptest = TRUE
))
funs_Cpar_vine$vin$obj$Matrix
     [,1] [,2] [,3]
[1,]    1    0    0
[2,]    3    2    0
[3,]    2    3    3
# second, make functions (CDF via smoothed empirical copula of simulated data)
funs_Cpar_vine <- funs_Cpar_vine |> 
  lapply(function(vine) {
    ran <- function(n) VineCopula::RVineSim(n, RVM = vine$obj)
    simdata <- ran(nrow(dat)*1000)
    list(  # return
      obj = vine$obj,
      pdf = function(x) VineCopula::RVinePDF(x, RVM = vine$obj),
      cdf = function(u) copula::C.n(u, X = simdata, smoothing = "none"),
      ran = ran
    )
  })

# examples:
# funs_Cpar_vine$vin$pdf(c(0.5,0.5,0.5))  # result: 5.367162
# funs_Cpar_vine$vin$cdf(c(0.5,0.5,0.5))  # result: 0.319...

# funs_Cpar_vine$vin$ran(2)
# result (random): 
#           Qmax    Volume  Duration
# [1,] 0.6301589 0.5407897 0.2184795
# [2,] 0.4032000 0.2401318 0.1016480

# sapply(funs_Cpar_vine, function(x) x$pdf(c(0.5,0.5,0.5)))
# result: 
#      vi1      vi2      vi3      vin 
# 5.396515 5.367162 4.410513 5.367162 
# obviously the 2nd structure is also the choice of automatic selection

3.3.4 GoF comparison

Gather models to single list

funs_Call <- c(funs_Cnp[c("chb", "beta")], 
               funs_Cpar_ell, 
               funs_Cpar_vine
               )

3.3.4.1 Numerical

Goodness-of-fit via MSE to empirical copula

local({
  cemp <- copula::F.n(x = dat_Memp_mat, X = dat_Memp_mat)
  sapply(funs_Call, function(cpar) {
    mean((cemp - cpar$cdf(dat_Memp_mat))^2)
  })
}) |> 
  signif(3) |> 
  sort()
     chb      vi2      tco      vi1      vi3      vin      nor     beta 
0.000810 0.000859 0.000861 0.000869 0.000907 0.000914 0.001040 0.001500 

One should be careful with performance of checkerboard copula since it arose from empirical copula and the same it is compared to.

3.3.4.2 Visual

Kendall’s distribution function. The lower black, straight line corresponds to co-monotonicity copula, the upper black curve corresponds to independence. The stairs represent empirical version.

funs_Call |> 
  lapply(function(cop) {
    sim <- cop$ran(1000)
    data.frame(z = copula::F.n(sim, sim)) |> 
      dplyr::mutate(cdf = rank(z)/(length(z)))
  }) |> 
  dplyr::bind_rows(.id = "copula") |> 
  dplyr::mutate(copula = forcats::as_factor(copula)) |> 
  dplyr::arrange(copula, z) |> 
  ggplot() + aes(x = z, y = cdf) +
  geom_step(data = dplyr::mutate(data.frame(z = funs_Cnp$emp$cdf(dat_Memp_mat)),
                                 cdf = rank(z)/length(z))
            ) +
  geom_line(aes(color = copula)) +
  geom_line(data = dplyr::mutate(data.frame(z = seq(0.01,1,by=0.01)), 
                                 cdf = z * (1 - log(z) + log(z)^2/2))
            ) +
  geom_line(data = data.frame(z = c(0,1), cdf = c(0,1)))

3.3.5 Selection

Choose a copula from the fitted!

copu_id <- "nor"
copu_fu <- funs_Call[[copu_id]]

Pairwise density contour plots of simulations from distribution with selected copula and standard normal margins.

sim_Csel_qnorm <- copu_fu$ran(10000) |> 
  data.frame() |> 
  dplyr::mutate(across(everything(), qnorm))
vars_comb |> 
  purrr::pwalk(.f = function(X1, X2) {
    p <- ggplot(data = sim_Csel_qnorm) + 
      aes(x = .data[[X1]], y = .data[[X2]]) + 
      geom_point(col = "grey") + 
      geom_point(data = dat_Memp_qnorm) +
      geom_density2d(adjust = c(1.5,1.5))
    print(p)
  }) 

Class specific visuals

if(stringr::str_starts(copu_id, "vi")) {  # vine copulas
  contour(copu_fu$obj)
  set.seed(2)
  plot(copu_fu$obj, type = 1, edge.labels = "family-tau", label.pad = 0.1)
  names(dat)[-1] |> setNames(1:3)
}

4 Joint distribution

Model functions

join_fu <- list(
  cdf = function(q) {
    q <- as.data.frame(q)
    u <- mapply(function(marg,vec) marg$cdf(vec), marg_fu, q)  # C(F1(x1),...)
    copu_fu$cdf(u)
  },
  pdf = function(x) {
    x <- as.data.frame(x)
    u <- mapply(function(marg,vec) marg$cdf(vec), marg_fu, x) |> rbind()
    pdfx <- mapply(function(marg,vec) marg$pdf(vec), marg_fu, x) |> rbind()
    copu_fu$pdf(u) * apply(pdfx, 1, prod)  # c(F1(x1),...)*f1(x1)*f2(x2)*f3(x3)
  },
  ran = function(n) {
    sim <- copu_fu$ran(n)  # simulate from copula, then turn to quantiles
    mapply(function(marg,vec) marg$qua(vec), marg_fu, as.data.frame(sim))
  }
)

—— UNDER CONSTRUCTION ——

References

Hofert, Marius, Ivan Kojadinovic, Martin Mächler, and Jun Yan. 2018. Elements of Copula Modeling with R. Springer International Publishing.
Johnson, N.L. 1949. “Systems of Frequency Curves Generated by Methods of Translation.” Biometrica, no. 36: 149–76.