library(ggplot2) # grammar of graphics package
library(patchwork) # combining ggplots
theme_set(theme_bw()) # ggplot white background themeModelling joint distribution of three hydrologic variables
Floof peak, volume and duration on the Parná river
1 Initial
1.1 Libraries and settings
1.2 Data import
dat <- read.table("data/HO_summer.csv",
header = TRUE, sep = ";", dec = ".", check.names = FALSE) |>
dplyr::transmute(index = Poradie,
Qmax = `Kulminacny prietok [m3/s]`,
Volume = `Objem vlny [mil. m3]`,
Duration = `Trvanie povodne [h]`/24)
str(dat)'data.frame': 33 obs. of 4 variables:
$ index : int 1 2 3 4 5 6 7 8 9 10 ...
$ Qmax : num 1.393 0.559 0.67 1.111 0.877 ...
$ Volume : num 0.1787 0.0456 0.1145 0.2481 0.0526 ...
$ Duration: num 4.38 3.04 3.92 5.33 3.46 ...
dat_long <- dat |>
tidyr::pivot_longer(cols = - index, names_to = "variable", values_to = "value") |>
dplyr::mutate(variable = forcats::as_factor(variable))
str(dat_long)tibble [99 × 3] (S3: tbl_df/tbl/data.frame)
$ index : int [1:99] 1 1 1 2 2 2 3 3 3 4 ...
$ variable: Factor w/ 3 levels "Qmax","Volume",..: 1 2 3 1 2 3 1 2 3 1 ...
$ value : num [1:99] 1.393 0.1787 4.375 0.559 0.0456 ...
2 Marginal distribution
2.1 Raw observations
dat_long |>
dplyr::mutate(variable = dplyr::recode(variable,
Qmax = "Qmax [m3/s]",
Volume = "Volume [mil m3]",
Duration = "Duration [day]")
) |>
ggplot() + aes(x = index, y = value) +
geom_point() +
facet_grid(rows = vars(variable), scales = "free_y") +
ylab("")2.2 Estimation
2.2.1 Helper functions and objects
One dimensional grid for visualization
# grid in wide format would be confusing
# because of placing unrelated values in one row
# grid1 <- dat |>
# dplyr::reframe(across(-index, ~ modelr::seq_range(.x, n = 20)))
grid1_long <- dat_long |>
dplyr::reframe(value = modelr::seq_range(value, n = 50),
.by = variable)
str(grid1_long)tibble [150 × 2] (S3: tbl_df/tbl/data.frame)
$ variable: Factor w/ 3 levels "Qmax","Volume",..: 1 1 1 1 1 1 1 1 1 1 ...
$ value : num [1:150] 0.089 0.778 1.467 2.157 2.846 ...
Fit univariate distribution by maximum-likelihood (ML) method. Starting values of parameters are refined by least-squares (LS) method.
# arguments:
# x - data to fit
# pdf, cdf - functions with data as first argument
# start - valid parameters to start with
fit_distr <- function(x, pdf, cdf, start, ...) {
Ecdf <- rank(x) / (length(x)+1)
fun <- function(pars) {
sum((Ecdf - cdf(x, !!!pars))^2) |> rlang::inject() # or do.call
}
fit <- optim(par = start, fn = fun, ...)
fun <- function(pars) {
- sum(log(pdf(x, !!!pars))) |> rlang::inject()
}
out <- try(optim(par = (9*fit$par + 1*start)/10, fn = fun, ...), silent = TRUE)
# the convex combination lowers the possibility of failure on boundary
if(inherits(out, "try-error")) {
out <- fit |> c(type = "LS")
} else {
out <- out |> c(type = "ML")
}
out
}2.2.2 Empirical
Empirical density (histogram)
dat_long |>
ggplot() + aes(x = value) +
geom_histogram(bins = 10, color = "white") +
facet_grid(cols = vars(variable), scales = "free_x") +
xlab("")Empirical CDF (standardized ranks, plotting position)
dat_Memp_long <- dat_long |>
dplyr::mutate(cdf = rank(value) / (dplyr::n() + 1), .by = variable)
dat_Memp_long |>
ggplot() + aes(x = value, y = cdf) +
geom_step() +
facet_grid(cols = vars(variable), scales = "free_x") +
xlab("")Pseudo-observations (CDF transformed data)
dat_Memp <- dat_Memp_long |>
tidyr::pivot_wider(id_cols = c(index), names_from = variable, values_from = cdf)2.2.3 Kernel smoothing
It is necessary to install probhat package and its dependencies (1.) kubik, (2.) barsurf.
# nested list (variable / function)
funs_Mks <- dat |>
dplyr::select(-index) |>
purrr::map(.f = \(x) purrr::map(
.x = c(cdf = "cdfuv.cks", pdf = "pdfuv.cks", qua = "qfuv.cks"),
.f = \(y) do.call(y, list(x = x, a = 0), envir = asNamespace("probhat"))
))
dat_Mks_long <- purrr::map(
.x = unique(dat_long$variable),
.f = \(x) {
dat_long |>
dplyr::filter(variable == x) |>
dplyr::mutate(pdf = funs_Mks[[x]]$pdf(value),
cdf = funs_Mks[[x]]$cdf(value))
}) |>
dplyr::bind_rows()
grid1_Mks_long <- purrr::map(
.x = unique(dat_long$variable),
.f = \(x) {
grid1_long |>
dplyr::filter(variable == x) |>
dplyr::mutate(pdf = funs_Mks[[x]]$pdf(value),
cdf = funs_Mks[[x]]$cdf(value))
}) |>
dplyr::bind_rows()
str(grid1_Mks_long)tibble [150 × 4] (S3: tbl_df/tbl/data.frame)
$ variable: Factor w/ 3 levels "Qmax","Volume",..: 1 1 1 1 1 1 1 1 1 1 ...
$ value : num [1:150] 0.089 0.778 1.467 2.157 2.846 ...
$ pdf : num [1:150] 0.198 0.251 0.252 0.193 0.133 ...
$ cdf : num [1:150] 0.0216 0.186 0.3461 0.5069 0.6207 ...
ggplot() + aes(x = value) +
geom_histogram(aes(y = after_stat(density)), data = dat_long,
bins = 10, color = "white", fill = "grey") +
#geom_density(data = dat_long, bounds = c(0, Inf)) +
geom_line(aes(y = pdf), data = grid1_Mks_long) +
facet_wrap(vars(variable), scales = "free")ggplot() + aes(x = value, y = cdf) +
geom_step(data = dat_Memp_long, color = "grey") +
geom_line(data = grid1_Mks_long) +
facet_wrap(vars(variable), scales = "free_x") +
xlab("")2.2.4 Parametric
2.2.4.1 L-moments
Parametric families of univariate distributions from package lmomco.
family_name_lmomco <- c("exp", "gam", "gev", "gld", "glo", "gno", "gov", "gpa", "gum", "kap", "lap", "ln3", "nor", "pe3", "ray", "revgum", "rice", "wak", "wei")
# nested list (variable / family / function)
funs_Mpar_lmo <- dat |>
dplyr::select(-index) |>
lapply(FUN = function(data) {
lmoms <- lmomco::lmoms(data)
lapply(family_name_lmomco |> (\(x) setNames(x,x))(), # to pass names
FUN = function(y) {
exceptions <- c("ln3", "kap", "gpa")
# tryCatch( # helps to identify family for which an error occurred # - start
if(y %in% exceptions) {
if(y == "ln3") pars <- lmomco::lmom2par(lmoms, type = y, zeta = 0)
if(y == "kap") pars <- lmomco::lmom2par(lmoms, type = y, snap.tau4=TRUE)
if(y == "gpa") pars <- lmomco::lmom2par(lmoms, type = y, xi = 0)
} else {
pars <- lmomco::lmom2par(lmoms, type = y)
}
#, error = function(e) { cat("with family", y,":\n"); print(e) }) # - end
list(
pdf = function(x) lmomco::par2pdf(x, para = pars),
cdf = function(q) lmomco::par2cdf(q, para = pars),
qua = function(p) lmomco::par2qua(p, para = pars)
)
})
})2.2.4.2 Maximum likelihood
User-defined parametric family: the Johnson bounded distribution (type SB) introduced by Johnson (1949):
# conditions: xi < x < xi + lambda
# underlying conversion function between Johnson and N(0,1) distribution
convert_Joh_norm <- function(x, gamma, delta, xi, lambda, to = c("norm", "Joh")) {
g <- function(x, a, b) (x-a)/b # standardization
f <- function(u) u/(1-u) # SB specific
fi <- function(u) u/(1+u) # inverse to f
out <- switch(to[1],
norm = gamma + delta*log(f(g(x,xi,lambda))),
Joh = xi + lambda*fi(exp(g(x,gamma,delta)))
)
out
}
# probability density function, PDF
dJohSB <- function(x, gamma, delta, xi, lambda) {
suppressWarnings(
z <- convert_Joh_norm(x, gamma, delta, xi, lambda)
) # suppress 'NaNs produced' warning
# pdfJ(x) = pdfN01(z) * |dz(x)/dx|
out <- dnorm(z) * delta * lambda / (x-xi) / (xi + lambda - x)
out[x <= xi | x >= xi + lambda] <- 0
out
}
# cumulative distribution function, CDF
pJohSB <- function(q, gamma, delta, xi, lambda) {
suppressWarnings(
z <- convert_Joh_norm(q, gamma, delta, xi, lambda)
) # suppress 'NaNs produced' warning
out <- pnorm(z)
out[q <= xi] <- 0
out[q >= xi + lambda] <- 1
out
}
# quantile function
qJohSB <- function(p, gamma, delta, xi, lambda) {
out <- p |>
qnorm() |>
convert_Joh_norm(gamma, delta, xi, lambda, to = "Joh")
out[is.nan(out)] <- xi + lambda
out
}
# random generator
rJohSB <- function(n, gamma, delta, xi, lambda) {
rnorm(n) |>
convert_Joh_norm(gamma, delta, xi, lambda, to = "Joh")
}
# Function to calculate ML estimates with constraints: lim[1] <= xi < min(x) & max(x) < xi + lambda <= lim[2]
# Standard errors are optionally returned as "se" attribute.
# TO DELETE:
# eJohSB <- function(x, eps = NULL, se = FALSE) {
# ran <- range(x)
# if(is.null(eps)) eps <- 0.001 * diff(ran) # stabilize MLE
# pdf <- function(x, gamma, delta, xi, xila) {
# dJohSB(x, gamma, delta, xi, xila - xi)
# }
# cdf <- function(x, gamma, delta, xi, xila) {
# pJohSB(x, gamma, delta, xi, xila - xi)
# }
# fit <- fit_distr(
# x, pdf, cdf,
# method = "L-BFGS-B",
# start = c(gamma = 0.2, delta = 0.5, xi = ran[1]*0.9, xila = ran[2]*1.1),
# lower = c(gamma = -Inf, delta = 0, xi = 0, xila = ran[2] + eps),
# upper = c(gamma = Inf, delta = Inf, xi = ran[1] - eps, xila = Inf),
# hessian = TRUE,
# control = list(maxit = 500)
# )
# lambda <- unname(fit$par["xila"] - fit$par["xi"])
# out <- c(fit$par[-4], lambda = lambda)
# if(isTRUE(se)) {
# vcov <- solve(fit$hessian)
# var_lambda <- unname(vcov["xi","xi"] + vcov["xila","xila"] + 2*vcov["xi","xila"])
# attr(out, "se") <- sqrt(c(diag(vcov)[-4], lambda = var_lambda))
# }
# out
# }
# in:
# x - numeric vector to be fitted
# lim - outer limits for values of the fitted random variable
# eps - a positive real number close to zero, tuning parameter
# se - logical, whether standard errors should be computed
# out: named vector of estimated parameters with optional attribute 'se'
eJohSB <- function(x, lim = c(0,Inf), eps = NULL, se = FALSE) {
ran <- range(x)
if(lim[1]>=ran[1] | lim[2]<=ran[2]) stop("Argument 'lim' is within the range of 'x'!")
if(is.null(eps)) eps <- 0.001 * diff(ran) # stabilize MLE
pdf <- function(x, gamma, delta, xi, xila) {
dJohSB(x, gamma, delta, xi, xila - xi)
}
cdf <- function(x, gamma, delta, xi, xila) {
pJohSB(x, gamma, delta, xi, xila - xi)
}
start_xi <- max((lim[1]+ran[1])/2, ran[1]*0.9) # in case lim contains infinity
start_xila <- min((ran[2]+lim[2])/2, ran[2]*1.1)
up_xi <- max((lim[1]+3*ran[1])/4, ran[1]-eps) # in case eps is too big
lo_xila <- min((3*ran[2]+lim[2])/4, ran[2]+eps)
fit <- fit_distr(
x, pdf, cdf,
method = "L-BFGS-B",
start = c(gamma = 0.2, delta = 0.5,
xi = start_xi, xila = start_xila),
lower = c(gamma = -Inf, delta = 0, xi = lim[1], xila = lo_xila),
upper = c(gamma = Inf, delta = Inf, xi = up_xi, xila = lim[2]),
hessian = se,
control = list(maxit = 500)
)
lambda <- unname(fit$par["xila"] - fit$par["xi"])
out <- c(fit$par[-4], lambda = lambda) |>
`attr<-`("method", fit$type)
if(isTRUE(se)) {
vcov <- solve(fit$hessian)
var_lambda <- unname(vcov["xi","xi"] + vcov["xila","xila"] + 2*vcov["xi","xila"])
attr(out, "se") <- sqrt(c(diag(vcov)[-4], lambda = var_lambda))
}
out
}
# example use:
# pJohSB(dat$Duration, gamma = 0.1, delta = 0.5, xi = 2, lambda = 6)
# or with rlang injection:
# tmp <- list(gamma = 0.1, delta = 0.5, xi = 2, lambda = 6)
# pJohSB(dat$Duration, !!!tmp) |> rlang::inject()
# dJohSB(dat$Duration, !!!tmp) |> rlang::inject()
# qJohSB(c(0,0.5,1), !!!tmp) |> rlang::inject()
# rJohSB(10, !!!tmp) |> rlang::inject()
# eJohSB(dat$Volume)Estimation
funs_Mpar_mle <- dat |>
dplyr::select(-index) |>
lapply(FUN = function(data) {
pars <- as.list(eJohSB(data))
list(joh = list(
pdf = function(x) dJohSB(x, !!!pars) |> rlang::inject(),
cdf = function(q) pJohSB(q, !!!pars) |> rlang::inject(),
qua = function(p) qJohSB(p, !!!pars) |> rlang::inject()
))
})2.2.4.3 Combine
Choose what to keep!
funs_Mpar <- mapply(FUN = c,
funs_Mpar_lmo[],
funs_Mpar_mle[],
SIMPLIFY = FALSE)Calculation in grid
grid1_Mpar_long <- purrr::imap(
.x = funs_Mpar,
.f = function(x, i) { # variable layer
purrr::imap(
.x = x,
.f = function(y, j) { # family layer
df <- dplyr::filter(grid1_long, variable == i)
data.frame(variable = i,
family = j,
value = df$value,
pdf = funs_Mpar[[i]][[j]]$pdf(df$value),
cdf = funs_Mpar[[i]][[j]]$cdf(df$value)
)
}
) |> dplyr::bind_rows()
}
) |> dplyr::bind_rows() |>
dplyr::mutate(variable = forcats::as_factor(variable))Calculation on observed data
dat_Mpar_long <- purrr::imap(
.x = funs_Mpar,
.f = function(x, i) { # variable layer
purrr::imap(
.x = x,
.f = function(y, j) { # family layer
df <- dat_long |>
dplyr::filter(variable == i) |>
dplyr::mutate(
family = j,
pdf = funs_Mpar[[i]][[j]]$pdf(value),
cdf = funs_Mpar[[i]][[j]]$cdf(value)
)
#data.frame(index =
# variable = i,
# family = j,
# value = df$value,
# pdf = funs_Mpar[[i]][[j]]$pdf(df$value),
# cdf = funs_Mpar[[i]][[j]]$cdf(df$value)
# )
}
) |> dplyr::bind_rows()
}
) |> dplyr::bind_rows() |>
dplyr::mutate(variable = forcats::as_factor(variable))2.2.5 Unite model functions
funs_Mall <- mapply(FUN = c,
list(ks = funs_Mks) |> purrr::transpose(),
funs_Mpar,
SIMPLIFY = FALSE)2.3 Decision
2.3.1 GoF comparison
2.3.1.1 Visual
p <- ggplot() + aes(x = value, y = cdf) +
geom_step(data = dat_Memp_long) +
geom_line(data = grid1_Mks_long) +
geom_line(data = grid1_Mpar_long, aes(color = family)) +
facet_wrap(vars(variable), ncol = 1, scales = "free_x") +
xlab("")
plotly::ggplotly(p, height = 800)A note to the above interactive plot: To deselect all but one colored curves, double-click the legend entry of your choice. To show all, reload the page.
2.3.1.2 Numeric
gof_marg <- dat_Mks_long |>
dplyr::mutate(family = "ks") |>
rbind(dat_Mpar_long) |>
dplyr::rename(model = family) |>
dplyr::select(-pdf) |>
dplyr::full_join(dat_Memp_long,
by = dplyr::join_by(index, variable, value),
suffix = c(".par", ".emp")
) |>
dplyr::summarise(MSE = mean((cdf.emp - cdf.par)^2), .by = c(variable, model)) |>
dplyr::arrange(variable, MSE) |>
dplyr::mutate(model = forcats::as_factor(model))
gof_marg |> dplyr::slice_head(n = 3, by = variable)# A tibble: 9 × 3
variable model MSE
<fct> <fct> <dbl>
1 Qmax gno 0.000894
2 Qmax kap 0.00111
3 Qmax wak 0.00111
4 Volume joh 0.000688
5 Volume gld 0.000744
6 Volume ln3 0.000784
7 Duration wak 0.000626
8 Duration ks 0.00102
9 Duration gld 0.00134
gof_marg |>
ggplot() + aes(x = model, y = MSE, color = variable) + geom_point() +
#facet_wrap(facets = vars(variable), ncol = 1) +
theme(legend.position = c(0.2,0.8))2.3.2 Selection
Choose what marginals model to continue with!
marg_id <- c(Qmax = "gev", Volume = "wei", Duration = "joh")Model functions in one object
marg_fu <- marg_id |>
purrr::imap(.f = function(x, i) {
funs_Mall[[i]][[x]]
})Last check of GoF
marg_fu |>
purrr::imap(.f = function(x, i) {
grid1_long |>
dplyr::filter(variable == i) |>
dplyr::mutate(cdf = x$cdf(value))
}) |>
purrr::list_rbind() |> # more-or-less the same as dplyr::bind_rows
ggplot() + aes(x = value, y = cdf) +
geom_step(data = dplyr::mutate(dat_Memp_long, data = "empirical")) +
geom_line(color = 4) +
facet_wrap(vars(variable), nrow = 1, scales = "free_x") +
xlab("")Kolmogorov-Smirnov, Cramer-von Mises and Anderson-Darling GoF test
marg_fu |>
purrr::imap(.f = function(x, i) {
dat_long |>
dplyr::filter(variable == i) |>
dplyr::summarize(
KS = ks.test(value, y = x$cdf)$p.value,
CvM = goftest::cvm.test(value, null = x$cdf)$p.value,
AD = goftest::ad.test(value, null = x$cdf)$p.value
) |>
round(4)
}) |>
purrr::list_rbind(names_to = "variable")Warning: There was 1 warning in `dplyr::summarize()`.
ℹ In argument: `KS = ks.test(value, y = x$cdf)$p.value`.
Caused by warning in `ks.test.default()`:
! ties should not be present for the Kolmogorov-Smirnov test
# A tibble: 3 × 4
variable KS CvM AD
<chr> <dbl> <dbl> <dbl>
1 Qmax 0.924 0.845 0.853
2 Volume 0.950 0.926 0
3 Duration 0.36 0.352 0.310
Strange behaviour of Anderson-Darling test. Subject to check.
3 Dependence
vars_comb <- levels(dat_long$variable) |>
combn(m = 2) |>
t() |> data.frame() 3.1 Single measure
Correlation coefficients (cor) and test of significance p-value (pval):
- Pearson’s (pea),
- Spearman’s (spe).
cor_test_comb <- vars_comb |>
dplyr::summarize(
pea = cor.test(dat[[X1]], dat[[X2]])[c("estimate","p.value")] |>
unlist() |> setNames(c("cor","pval")) |> as.list() |> data.frame(),
spe = cor.test(dat[[X1]], dat[[X2]], method = "spearman", exact = FALSE) |>
(`[`)(c("estimate","p.value")) |>
unlist() |> setNames(c("cor","pval")) |> as.list() |> data.frame(),
.by = c(X1,X2)
) |>
tidyr::unnest(cols = c(pea, spe), names_sep = ".") |>
dplyr::mutate(across(-c(X1,X2), ~round(.x, 4)))
cor_test_comb |> as.data.frame() X1 X2 pea.cor pea.pval spe.cor spe.pval
1 Qmax Volume 0.7616 0.0000 0.8643 0.0000
2 Qmax Duration 0.1309 0.4679 0.4108 0.0175
3 Volume Duration 0.5545 0.0008 0.7234 0.0000
Partial correlation matrix
dat[,-1] |>
cov() |>
solve() |>
cov2cor() |>
(`*`)(2*diag(1,ncol(dat[,-1]),ncol(dat[,-1]))-1) |>
round(2) Qmax Volume Duration
Qmax 1.00 0.84 -0.54
Volume 0.84 1.00 0.71
Duration -0.54 0.71 1.00
dat[-1] |>
cov() |>
qgraph::qgraph(edge.labels=TRUE, graph="pcor",
edge.label.cex=2, edge.label.margin = 0.05, label.cex=1.3)3.2 Visual exploration
Observed data and Pearson correlation coefficient (with p-value of its significance test)
cor_test_comb |>
dplyr::select(X1, X2, pea.cor, pea.pval) |>
purrr::pwalk(.f = function(X1, X2, pea.cor, pea.pval) {
p <- ggplot(data = dat) +
aes(x = .data[[X1]], y = .data[[X2]]) +
geom_point() +
geom_text(data = data.frame(),
aes(label = paste0(round(pea.cor, 2), " (", round(pea.pval, 3), ")")),
x=Inf, y=-Inf, hjust = 1.5, vjust = -2, size=3)
print(p)
})Observations transformed by empirical CDF and Spearman correlation coefficient (with p-value of its significance test) displayed in the corner
cor_test_comb |>
dplyr::select(X1, X2, spe.cor, spe.pval) |>
purrr::pwalk(.f = function(X1, X2, spe.cor, spe.pval) {
p <- ggplot(data = dat_Memp) +
aes(x = .data[[X1]], y = .data[[X2]]) +
geom_point() +
geom_text(data = data.frame(),
aes(label = paste0(round(spe.cor, 2), " (", round(spe.pval, 3), ")")),
x = Inf, y = -Inf, hjust = 1.2, vjust = -1.5, size = 3)
print(p)
}) Pseudo-observations with standard normal marginals and density level curves (to be compared with bivariate normal density which is elliptically contoured)
dat_Memp_qnorm <- dat_Memp |>
dplyr::mutate(across(-index, qnorm))
vars_comb |>
purrr::pwalk(.f = function(X1, X2) {
p <- ggplot(data = dat_Memp_qnorm) +
aes(x = .data[[X1]], y = .data[[X2]]) +
geom_point() +
geom_density2d(adjust = c(1.3,1.3))
print(p)
}) 3.3 Copula
# dataset in matrix form and without indices
dat_mat <- dat[-1] |> as.matrix()
dat_Memp_mat <- dat_Memp[,-1] |> as.matrix()3.3.1 Nonparametric
Empirical copula with no or specified smoothing. See Hofert et al. (2018), chapter 4.2 for details on smoothing methods (and their meaning for small samples) and chapter 6.1 for discussion about dealing with ties (effect on estimation).
funs_Cnp <- c(emp = "none", chb = "checkerboard", beta = "beta") |>
lapply(FUN = function(type) {
obj <- copula::empCopula(dat_Memp_mat, ties.method = "average", smoothing = type)
list( # return
obj = obj,
pdf = function(x) copula::dCopula(x, copula = obj),
cdf = function(x) copula::pCopula(x, copula = obj),
ran = function(n) copula::rCopula(n, copula = obj) |>
(`colnames<-`)(colnames(dat_Memp_mat))
# because beta fails to pass colnames (copula version 1.1-2)
)
}
)
# examples:
# funs_Cnp$emp$cdf(rep(0.5,3))
# funs_Cnp$chb$cdf(rep(0.5,3))
# funs_Cnp$beta$cdf(rep(0.5,3))
# results: 0.3636364, 0.3333333, 0.29907093.3.2 Elliptical
(Correct name: copulas of elliptically contoured distributions.)
Estimation
funs_Cpar_ell <- c(nor = "normal", tco = "t") |>
lapply(FUN = function(fam) {
obj <- copula::ellipCopula( # initial constructor
fam,
# param = cor(dat_mat) |> (\(x) x[upper.tri(x)])(), # not necessary
dim = ncol(dat_mat),
dispstr = "un"
) |>
copula::fitCopula(data = dat_Memp_mat) |> # estimation
(`@`)("copula") # copula object extraction
if(fam == "t") { # round df, otherwise pCopula fails
df_ind <- length(obj@parameters) # index of df (last parameter)
obj@parameters[df_ind] <- obj@parameters[df_ind] |> round() |> max(1)
}
list( # return
obj = obj,
pdf = function(x) copula::dCopula(x, copula = obj),
cdf = function(u) copula::pCopula(u, copula = obj),
ran = function(n) copula::rCopula(n, copula = obj) |>
(`colnames<-`)(colnames(dat_mat))
)
})Warning in var.mpl(copula, u): the covariance matrix of the parameter estimates
is computed as if 'df.fixed = TRUE' with df = 7.11485542432891
# examples:
# funs_Cpar_ell$nor$pdf(c(0.5,0.5,0.5)) # result: 4.268227
# funs_Cpar_ell$tco$cdf(c(0.5,0.5,0.5)) # result: 0.31756863.3.3 Vine
Specify admissible families and particular vine structure via R-vine matrix!
families_vine <- NA # or 0:6 for standard 1-parameter bicopulas
structures_vine <- list(
vi1 = rbind( # 1st variable as root
c(2, 0, 0),
c(3, 3, 0),
c(1, 1, 1)
),
vi2 = rbind( # 2nd variable as root
c(1, 0, 0),
c(3, 3, 0),
c(2, 2, 2)
),
vi3 = rbind( # 3rd variable as root
c(2, 0, 0),
c(1, 1, 0),
c(3, 3, 3)
)
)Estimation
# first, define vine objects
funs_Cpar_vine <- structures_vine |>
lapply(function(rvm) {
list(obj = VineCopula::RVineCopSelect(
data = dat_Memp_mat,
familyset = families_vine,
Matrix = rvm,
rotations = TRUE
))
})
# funs_Cpar_vine <- list() # uncomment for automatic structure selection only
funs_Cpar_vine$vin <- list(obj = VineCopula::RVineStructureSelect( # auto
dat_Memp_mat,
familyset = families_vine,
indeptest = TRUE
))
funs_Cpar_vine$vin$obj$Matrix [,1] [,2] [,3]
[1,] 1 0 0
[2,] 3 2 0
[3,] 2 3 3
# second, make functions (CDF via smoothed empirical copula of simulated data)
funs_Cpar_vine <- funs_Cpar_vine |>
lapply(function(vine) {
ran <- function(n) VineCopula::RVineSim(n, RVM = vine$obj)
simdata <- ran(nrow(dat)*1000)
list( # return
obj = vine$obj,
pdf = function(x) VineCopula::RVinePDF(x, RVM = vine$obj),
cdf = function(u) copula::C.n(u, X = simdata, smoothing = "none"),
ran = ran
)
})
# examples:
# funs_Cpar_vine$vin$pdf(c(0.5,0.5,0.5)) # result: 5.367162
# funs_Cpar_vine$vin$cdf(c(0.5,0.5,0.5)) # result: 0.319...
# funs_Cpar_vine$vin$ran(2)
# result (random):
# Qmax Volume Duration
# [1,] 0.6301589 0.5407897 0.2184795
# [2,] 0.4032000 0.2401318 0.1016480
# sapply(funs_Cpar_vine, function(x) x$pdf(c(0.5,0.5,0.5)))
# result:
# vi1 vi2 vi3 vin
# 5.396515 5.367162 4.410513 5.367162
# obviously the 2nd structure is also the choice of automatic selection3.3.4 GoF comparison
Gather models to single list
funs_Call <- c(funs_Cnp[c("chb", "beta")],
funs_Cpar_ell,
funs_Cpar_vine
)3.3.4.1 Numerical
Goodness-of-fit via MSE to empirical copula
local({
cemp <- copula::F.n(x = dat_Memp_mat, X = dat_Memp_mat)
sapply(funs_Call, function(cpar) {
mean((cemp - cpar$cdf(dat_Memp_mat))^2)
})
}) |>
signif(3) |>
sort() chb vi2 tco vi1 vi3 vin nor beta
0.000810 0.000859 0.000861 0.000869 0.000907 0.000914 0.001040 0.001500
One should be careful with performance of checkerboard copula since it arose from empirical copula and the same it is compared to.
3.3.4.2 Visual
Kendall’s distribution function. The lower black, straight line corresponds to co-monotonicity copula, the upper black curve corresponds to independence. The stairs represent empirical version.
funs_Call |>
lapply(function(cop) {
sim <- cop$ran(1000)
data.frame(z = copula::F.n(sim, sim)) |>
dplyr::mutate(cdf = rank(z)/(length(z)))
}) |>
dplyr::bind_rows(.id = "copula") |>
dplyr::mutate(copula = forcats::as_factor(copula)) |>
dplyr::arrange(copula, z) |>
ggplot() + aes(x = z, y = cdf) +
geom_step(data = dplyr::mutate(data.frame(z = funs_Cnp$emp$cdf(dat_Memp_mat)),
cdf = rank(z)/length(z))
) +
geom_line(aes(color = copula)) +
geom_line(data = dplyr::mutate(data.frame(z = seq(0.01,1,by=0.01)),
cdf = z * (1 - log(z) + log(z)^2/2))
) +
geom_line(data = data.frame(z = c(0,1), cdf = c(0,1)))3.3.5 Selection
Choose a copula from the fitted!
copu_id <- "nor"
copu_fu <- funs_Call[[copu_id]]Pairwise density contour plots of simulations from distribution with selected copula and standard normal margins.
sim_Csel_qnorm <- copu_fu$ran(10000) |>
data.frame() |>
dplyr::mutate(across(everything(), qnorm))
vars_comb |>
purrr::pwalk(.f = function(X1, X2) {
p <- ggplot(data = sim_Csel_qnorm) +
aes(x = .data[[X1]], y = .data[[X2]]) +
geom_point(col = "grey") +
geom_point(data = dat_Memp_qnorm) +
geom_density2d(adjust = c(1.5,1.5))
print(p)
}) Class specific visuals
if(stringr::str_starts(copu_id, "vi")) { # vine copulas
contour(copu_fu$obj)
set.seed(2)
plot(copu_fu$obj, type = 1, edge.labels = "family-tau", label.pad = 0.1)
names(dat)[-1] |> setNames(1:3)
}4 Joint distribution
Model functions
join_fu <- list(
cdf = function(q) {
q <- as.data.frame(q)
u <- mapply(function(marg,vec) marg$cdf(vec), marg_fu, q) # C(F1(x1),...)
copu_fu$cdf(u)
},
pdf = function(x) {
x <- as.data.frame(x)
u <- mapply(function(marg,vec) marg$cdf(vec), marg_fu, x) |> rbind()
pdfx <- mapply(function(marg,vec) marg$pdf(vec), marg_fu, x) |> rbind()
copu_fu$pdf(u) * apply(pdfx, 1, prod) # c(F1(x1),...)*f1(x1)*f2(x2)*f3(x3)
},
ran = function(n) {
sim <- copu_fu$ran(n) # simulate from copula, then turn to quantiles
mapply(function(marg,vec) marg$qua(vec), marg_fu, as.data.frame(sim))
}
)—— UNDER CONSTRUCTION ——