Modelling joint distribution of three hydrologic variables
Flood peak, volume and duration on the Parná river
1 Initial
1.1 Libraries and settings
library(ggplot2)      # grammar of graphics package
library(patchwork)    # combining ggplots
theme_set(theme_bw()) # ggplot white background theme
1.2 Data import
dat <- read.table("data/HO_summer.csv",
                  header = TRUE, sep = ";", dec = ".", check.names = FALSE) |>
  dplyr::transmute(index = Poradie,                      # Poradie = event order
                   Qmax = `Kulminacny prietok [m3/s]`,   # peak discharge
                   Volume = `Objem vlny [mil. m3]`,      # flood wave volume
                   Duration = `Trvanie povodne [h]`/24)  # flood duration, converted to days
str(dat)
'data.frame': 33 obs. of 4 variables:
$ index : int 1 2 3 4 5 6 7 8 9 10 ...
$ Qmax : num 1.393 0.559 0.67 1.111 0.877 ...
$ Volume : num 0.1787 0.0456 0.1145 0.2481 0.0526 ...
$ Duration: num 4.38 3.04 3.92 5.33 3.46 ...
dat_long <- dat |>
  tidyr::pivot_longer(cols = -index, names_to = "variable", values_to = "value") |>
  dplyr::mutate(variable = forcats::as_factor(variable))
str(dat_long)
tibble [99 × 3] (S3: tbl_df/tbl/data.frame)
$ index : int [1:99] 1 1 1 2 2 2 3 3 3 4 ...
$ variable: Factor w/ 3 levels "Qmax","Volume",..: 1 2 3 1 2 3 1 2 3 1 ...
$ value : num [1:99] 1.393 0.1787 4.375 0.559 0.0456 ...
2 Marginal distribution
2.1 Raw observations
dat_long |>
  dplyr::mutate(variable = dplyr::recode(variable,
                                         Qmax = "Qmax [m3/s]",
                                         Volume = "Volume [mil m3]",
                                         Duration = "Duration [day]")) |>
  ggplot() + aes(x = index, y = value) +
  geom_point() +
  facet_grid(rows = vars(variable), scales = "free_y") +
  ylab("")
2.2 Estimation
2.2.1 Helper functions and objects
One-dimensional grid for visualization
# a grid in wide format would be confusing
# because it would place unrelated values in one row
# grid1 <- dat |>
#   dplyr::reframe(across(-index, ~ modelr::seq_range(.x, n = 20)))
grid1_long <- dat_long |>
  dplyr::reframe(value = modelr::seq_range(value, n = 50),
                 .by = variable)
str(grid1_long)
tibble [150 × 2] (S3: tbl_df/tbl/data.frame)
$ variable: Factor w/ 3 levels "Qmax","Volume",..: 1 1 1 1 1 1 1 1 1 1 ...
$ value : num [1:150] 0.089 0.778 1.467 2.157 2.846 ...
Fit a univariate distribution by the maximum-likelihood (ML) method. The starting values of the parameters are first refined by a least-squares (LS) fit to the empirical CDF.
# arguments:
#   x - data to fit
#   pdf, cdf - functions with data as first argument
#   start - valid parameters to start with
fit_distr <- function(x, pdf, cdf, start, ...) {
  Ecdf <- rank(x) / (length(x) + 1)
  fun <- function(pars) {
    sum((Ecdf - cdf(x, !!!pars))^2) |> rlang::inject() # or do.call
  }
  fit <- optim(par = start, fn = fun, ...)
  fun <- function(pars) {
    - sum(log(pdf(x, !!!pars))) |> rlang::inject()
  }
  out <- try(optim(par = (9*fit$par + 1*start)/10, fn = fun, ...), silent = TRUE)
  # the convex combination lowers the risk of a failure on the boundary
  if(inherits(out, "try-error")) {
    out <- fit |> c(type = "LS")
  } else {
    out <- out |> c(type = "ML")
  }
  out
}
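A minimal usage sketch (an illustration added here, not part of the original analysis), fitting a two-parameter gamma distribution to the flood volumes; the extra arguments are passed on to optim():
# example use (illustration only):
# fit_distr(dat$Volume, pdf = dgamma, cdf = pgamma,
#           start = c(shape = 1, rate = 1),
#           method = "L-BFGS-B", lower = c(shape = 1e-6, rate = 1e-6))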
2.2.2 Empirical
Empirical density (histogram)
dat_long |>
  ggplot() + aes(x = value) +
  geom_histogram(bins = 10, color = "white") +
  facet_grid(cols = vars(variable), scales = "free_x") +
  xlab("")
Empirical CDF (standardized ranks, plotting position)
dat_Memp_long <- dat_long |>
  dplyr::mutate(cdf = rank(value) / (dplyr::n() + 1), .by = variable)

dat_Memp_long |>
  ggplot() + aes(x = value, y = cdf) +
  geom_step() +
  facet_grid(cols = vars(variable), scales = "free_x") +
  xlab("")
Pseudo-observations (CDF-transformed data)
dat_Memp <- dat_Memp_long |>
  tidyr::pivot_wider(id_cols = c(index), names_from = variable, values_from = cdf)
2.2.3 Kernel smoothing
It is necessary to install the probhat package together with its dependencies kubik and barsurf.
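If they are not installed yet, a one-off installation sketch along these lines should suffice (standard CRAN installation, shown only for convenience):
# install.packages(c("kubik", "barsurf", "probhat"))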
# nested list (variable / function)
funs_Mks <- dat |>
  dplyr::select(-index) |>
  purrr::map(.f = \(x) purrr::map(
    .x = c(cdf = "cdfuv.cks", pdf = "pdfuv.cks", qua = "qfuv.cks"),
    .f = \(y) do.call(y, list(x = x, a = 0), envir = asNamespace("probhat"))
  ))

dat_Mks_long <- purrr::map(
  .x = unique(dat_long$variable),
  .f = \(x) {
    dat_long |>
      dplyr::filter(variable == x) |>
      dplyr::mutate(pdf = funs_Mks[[x]]$pdf(value),
                    cdf = funs_Mks[[x]]$cdf(value))
  }) |>
  dplyr::bind_rows()

grid1_Mks_long <- purrr::map(
  .x = unique(dat_long$variable),
  .f = \(x) {
    grid1_long |>
      dplyr::filter(variable == x) |>
      dplyr::mutate(pdf = funs_Mks[[x]]$pdf(value),
                    cdf = funs_Mks[[x]]$cdf(value))
  }) |>
  dplyr::bind_rows()
str(grid1_Mks_long)
tibble [150 × 4] (S3: tbl_df/tbl/data.frame)
$ variable: Factor w/ 3 levels "Qmax","Volume",..: 1 1 1 1 1 1 1 1 1 1 ...
$ value : num [1:150] 0.089 0.778 1.467 2.157 2.846 ...
$ pdf : num [1:150] 0.198 0.251 0.252 0.193 0.133 ...
$ cdf : num [1:150] 0.0216 0.186 0.3461 0.5069 0.6207 ...
ggplot() + aes(x = value) +
geom_histogram(aes(y = after_stat(density)), data = dat_long,
bins = 10, color = "white", fill = "grey") +
#geom_density(data = dat_long, bounds = c(0, Inf)) +
geom_line(aes(y = pdf), data = grid1_Mks_long) +
facet_wrap(vars(variable), scales = "free")
ggplot() + aes(x = value, y = cdf) +
geom_step(data = dat_Memp_long, color = "grey") +
geom_line(data = grid1_Mks_long) +
facet_wrap(vars(variable), scales = "free_x") +
xlab("")
2.2.4 Parametric
2.2.4.1 L-moments
Parametric families of univariate distributions from package lmomco.
<- c("exp", "gam", "gev", "gld", "glo", "gno", "gov", "gpa", "gum", "kap", "lap", "ln3", "nor", "pe3", "ray", "revgum", "rice", "wak", "wei")
family_name_lmomco
# nested list (variable / family / function)
funs_Mpar_lmo <- dat |>
  dplyr::select(-index) |>
  lapply(FUN = function(data) {
    lmoms <- lmomco::lmoms(data)
    lapply(family_name_lmomco |> (\(x) setNames(x, x))(), # to pass names
           FUN = function(y) {
             exceptions <- c("ln3", "kap", "gpa")
             # tryCatch( # helps to identify the family for which an error occurred - start
             if(y %in% exceptions) {
               if(y == "ln3") pars <- lmomco::lmom2par(lmoms, type = y, zeta = 0)
               if(y == "kap") pars <- lmomco::lmom2par(lmoms, type = y, snap.tau4 = TRUE)
               if(y == "gpa") pars <- lmomco::lmom2par(lmoms, type = y, xi = 0)
             } else {
               pars <- lmomco::lmom2par(lmoms, type = y)
             }
             #, error = function(e) { cat("with family", y, ":\n"); print(e) }) # - end
             list(
               pdf = function(x) lmomco::par2pdf(x, para = pars),
               cdf = function(q) lmomco::par2cdf(q, para = pars),
               qua = function(p) lmomco::par2qua(p, para = pars)
             )
           })
  })
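The resulting nesting is variable / family / function; an access sketch for illustration (the numeric values are hypothetical):
# funs_Mpar_lmo$Qmax$gev$cdf(1)      # fitted GEV CDF of Qmax evaluated at 1 m3/s
# funs_Mpar_lmo$Volume$wei$qua(0.99) # 99 % quantile of Volume from the Weibull fit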
2.2.4.2 Maximum likelihood
User-defined parametric family: the Johnson bounded distribution (type SB) introduced by Johnson (1949):
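For reference (a standard property of the SB family, added here as a reading aid in the notation of the code below): a value x in (xi, xi + lambda) is mapped to a standard normal z by z = gamma + delta * log((x - xi) / (xi + lambda - x)), and the density implemented below follows from this change of variables.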
# conditions: xi < x < xi + lambda
# underlying conversion function between the Johnson and N(0,1) distributions
convert_Joh_norm <- function(x, gamma, delta, xi, lambda, to = c("norm", "Joh")) {
  g <- function(x, a, b) (x-a)/b # standardization
  f <- function(u) u/(1-u)       # SB specific
  fi <- function(u) u/(1+u)      # inverse to f
  out <- switch(to[1],
    norm = gamma + delta*log(f(g(x, xi, lambda))),
    Joh = xi + lambda*fi(exp(g(x, gamma, delta)))
  )
  out
}
# probability density function, PDF
dJohSB <- function(x, gamma, delta, xi, lambda) {
  suppressWarnings(
    z <- convert_Joh_norm(x, gamma, delta, xi, lambda)
  ) # suppress 'NaNs produced' warning
  # pdfJ(x) = pdfN01(z) * |dz(x)/dx|
  out <- dnorm(z) * delta * lambda / (x-xi) / (xi + lambda - x)
  out[x <= xi | x >= xi + lambda] <- 0
  out
}
# cumulative distribution function, CDF
pJohSB <- function(q, gamma, delta, xi, lambda) {
  suppressWarnings(
    z <- convert_Joh_norm(q, gamma, delta, xi, lambda)
  ) # suppress 'NaNs produced' warning
  out <- pnorm(z)
  out[q <= xi] <- 0
  out[q >= xi + lambda] <- 1
  out
}
# quantile function
qJohSB <- function(p, gamma, delta, xi, lambda) {
  out <- p |>
    qnorm() |>
    convert_Joh_norm(gamma, delta, xi, lambda, to = "Joh")
  out[is.nan(out)] <- xi + lambda
  out
}
# random generator
rJohSB <- function(n, gamma, delta, xi, lambda) {
  rnorm(n) |>
    convert_Joh_norm(gamma, delta, xi, lambda, to = "Joh")
}
# Function to calculate ML estimates with constraints: lim[1] <= xi < min(x) & max(x) < xi + lambda <= lim[2]
# Standard errors are optionally returned as "se" attribute.
# in:
# x - numeric vector to be fitted
# lim - outer limits for values of the fitted random variable
# eps - a positive real number close to zero, tuning parameter
# se - logical, whether standard errors should be computed
# out: named vector of estimated parameters with optional attribute 'se'
eJohSB <- function(x, lim = c(0, Inf), eps = NULL, se = FALSE) {
  ran <- range(x)
  if(lim[1] >= ran[1] | lim[2] <= ran[2]) stop("Argument 'lim' must contain the range of 'x'!")
  if(is.null(eps)) eps <- 0.001 * diff(ran) # stabilize MLE
  pdf <- function(x, gamma, delta, xi, xila) {
    dJohSB(x, gamma, delta, xi, xila - xi)
  }
  cdf <- function(x, gamma, delta, xi, xila) {
    pJohSB(x, gamma, delta, xi, xila - xi)
  }
  start_xi <- max((lim[1]+ran[1])/2, ran[1]*0.9)   # in case lim contains infinity
  start_xila <- min((ran[2]+lim[2])/2, ran[2]*1.1)
  up_xi <- max((lim[1]+3*ran[1])/4, ran[1]-eps)    # in case eps is too big
  lo_xila <- min((3*ran[2]+lim[2])/4, ran[2]+eps)
  fit <- fit_distr(
    x, pdf, cdf, method = "L-BFGS-B",
    start = c(gamma = 0.2, delta = 0.5,
              xi = start_xi, xila = start_xila),
    lower = c(gamma = -Inf, delta = 0, xi = lim[1], xila = lo_xila),
    upper = c(gamma = Inf, delta = Inf, xi = up_xi, xila = lim[2]),
    hessian = se,
    control = list(maxit = 500)
  )
  lambda <- unname(fit$par["xila"] - fit$par["xi"])
  out <- c(fit$par[-4], lambda = lambda) |>
    (`attr<-`)("method", fit$type)
  if(isTRUE(se)) {
    vcov <- solve(fit$hessian)
    var_lambda <- unname(vcov["xi","xi"] + vcov["xila","xila"] + 2*vcov["xi","xila"])
    attr(out, "se") <- sqrt(c(diag(vcov)[-4], lambda = var_lambda))
  }
  out
}
# example use:
# pJohSB(dat$Duration, gamma = 0.1, delta = 0.5, xi = 2, lambda = 6)
# or with rlang injection:
# tmp <- list(gamma = 0.1, delta = 0.5, xi = 2, lambda = 6)
# pJohSB(dat$Duration, !!!tmp) |> rlang::inject()
# dJohSB(dat$Duration, !!!tmp) |> rlang::inject()
# qJohSB(c(0,0.5,1), !!!tmp) |> rlang::inject()
# rJohSB(10, !!!tmp) |> rlang::inject()
# eJohSB(dat$Volume)
Estimation
funs_Mpar_mle <- dat |>
  dplyr::select(-index) |>
  lapply(FUN = function(data) {
    pars <- as.list(eJohSB(data))
    list(joh = list(
      pdf = function(x) dJohSB(x, !!!pars) |> rlang::inject(),
      cdf = function(q) pJohSB(q, !!!pars) |> rlang::inject(),
      qua = function(p) qJohSB(p, !!!pars) |> rlang::inject()
    ))
  })
2.2.4.3 Combine
Choose what to keep!
funs_Mpar <- mapply(FUN = c,
                    funs_Mpar_lmo[],
                    funs_Mpar_mle[],
                    SIMPLIFY = FALSE)
Calculation on the grid
grid1_Mpar_long <- purrr::imap(
  .x = funs_Mpar,
  .f = function(x, i) { # variable layer
    purrr::imap(
      .x = x,
      .f = function(y, j) { # family layer
        df <- dplyr::filter(grid1_long, variable == i)
        data.frame(variable = i,
                   family = j,
                   value = df$value,
                   pdf = funs_Mpar[[i]][[j]]$pdf(df$value),
                   cdf = funs_Mpar[[i]][[j]]$cdf(df$value))
      }) |>
      dplyr::bind_rows()
  }) |>
  dplyr::bind_rows() |>
  dplyr::mutate(variable = forcats::as_factor(variable))
Calculation on observed data
dat_Mpar_long <- purrr::imap(
  .x = funs_Mpar,
  .f = function(x, i) { # variable layer
    purrr::imap(
      .x = x,
      .f = function(y, j) { # family layer
        dat_long |>
          dplyr::filter(variable == i) |>
          dplyr::mutate(
            family = j,
            pdf = funs_Mpar[[i]][[j]]$pdf(value),
            cdf = funs_Mpar[[i]][[j]]$cdf(value)
          )
      }) |>
      dplyr::bind_rows()
  }) |>
  dplyr::bind_rows() |>
  dplyr::mutate(variable = forcats::as_factor(variable))
2.2.5 Unite model functions
funs_Mall <- mapply(FUN = c,
                    list(ks = funs_Mks) |> purrr::transpose(),
                    funs_Mpar,
                    SIMPLIFY = FALSE)
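After this step the nesting is funs_Mall[[variable]][[model]], where model is "ks" or one of the parametric families; for illustration (values hypothetical):
# funs_Mall$Qmax$ks$cdf(1)    # kernel-smoothed CDF of Qmax at 1 m3/s
# funs_Mall$Qmax$gev$qua(0.9) # 90 % quantile of Qmax from the GEV fit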
2.3 Decision
2.3.1 GoF comparison
2.3.1.1 Visual
p <- ggplot() + aes(x = value, y = cdf) +
  geom_step(data = dat_Memp_long) +
  geom_line(data = grid1_Mks_long) +
  geom_line(data = grid1_Mpar_long, aes(color = family)) +
  facet_wrap(vars(variable), ncol = 1, scales = "free_x") +
  xlab("")
plotly::ggplotly(p, height = 800)
A note on the interactive plot above: to hide all but one of the colored curves, double-click the legend entry of your choice; to show all curves again, reload the page.
2.3.1.2 Numeric
gof_marg <- dat_Mks_long |>
  dplyr::mutate(family = "ks") |>
  rbind(dat_Mpar_long) |>
  dplyr::rename(model = family) |>
  dplyr::select(-pdf) |>
  dplyr::full_join(dat_Memp_long,
                   by = dplyr::join_by(index, variable, value),
                   suffix = c(".par", ".emp")) |>
  dplyr::summarise(MSE = mean((cdf.emp - cdf.par)^2), .by = c(variable, model)) |>
  dplyr::arrange(variable, MSE) |>
  dplyr::mutate(model = forcats::as_factor(model))

gof_marg |> dplyr::slice_head(n = 3, by = variable)
# A tibble: 9 × 3
variable model MSE
<fct> <fct> <dbl>
1 Qmax gno 0.000894
2 Qmax kap 0.00111
3 Qmax wak 0.00111
4 Volume joh 0.000688
5 Volume gld 0.000744
6 Volume ln3 0.000784
7 Duration wak 0.000626
8 Duration ks 0.00102
9 Duration gld 0.00134
gof_marg |>
  ggplot() + aes(x = model, y = MSE, color = variable) + geom_point() +
  #facet_wrap(facets = vars(variable), ncol = 1) +
  theme(legend.position = c(0.2, 0.8))
2.3.2 Selection
Choose which marginal models to continue with!
marg_id <- c(Qmax = "gev", Volume = "wei", Duration = "joh")
Model functions in one object
marg_fu <- marg_id |>
  purrr::imap(.f = function(x, i) {
    funs_Mall[[i]][[x]]
  })
Last check of GoF
marg_fu |>
  purrr::imap(.f = function(x, i) {
    grid1_long |>
      dplyr::filter(variable == i) |>
      dplyr::mutate(cdf = x$cdf(value))
  }) |>
  purrr::list_rbind() |> # more-or-less the same as dplyr::bind_rows
  ggplot() + aes(x = value, y = cdf) +
  geom_step(data = dplyr::mutate(dat_Memp_long, data = "empirical")) +
  geom_line(color = 4) +
  facet_wrap(vars(variable), nrow = 1, scales = "free_x") +
  xlab("")
Kolmogorov-Smirnov, Cramér-von Mises and Anderson-Darling GoF tests
marg_fu |>
  purrr::imap(.f = function(x, i) {
    dat_long |>
      dplyr::filter(variable == i) |>
      dplyr::summarize(
        KS = ks.test(value, y = x$cdf)$p.value,
        CvM = goftest::cvm.test(value, null = x$cdf)$p.value,
        AD = goftest::ad.test(value, null = x$cdf)$p.value
      ) |>
      round(4)
  }) |>
  purrr::list_rbind(names_to = "variable")
Warning: There was 1 warning in `dplyr::summarize()`.
ℹ In argument: `KS = ks.test(value, y = x$cdf)$p.value`.
Caused by warning in `ks.test.default()`:
! ties should not be present for the Kolmogorov-Smirnov test
# A tibble: 3 × 4
variable KS CvM AD
<chr> <dbl> <dbl> <dbl>
1 Qmax 0.924 0.845 0.853
2 Volume 0.950 0.926 0
3 Duration 0.36 0.352 0.310
The Anderson-Darling test behaves strangely (note the zero p-value for Volume); this remains to be checked.
3 Dependence
vars_comb <- levels(dat_long$variable) |>
  combn(m = 2) |>
  t() |>
  data.frame()
3.1 Single measure
Correlation coefficients (cor) and the p-values of their significance tests (pval):
- Pearson’s (pea),
- Spearman’s (spe).
cor_test_comb <- vars_comb |>
  dplyr::summarize(
    pea = cor.test(dat[[X1]], dat[[X2]])[c("estimate","p.value")] |>
      unlist() |> setNames(c("cor","pval")) |> as.list() |> data.frame(),
    spe = cor.test(dat[[X1]], dat[[X2]], method = "spearman", exact = FALSE) |>
      (`[`)(c("estimate","p.value")) |>
      unlist() |> setNames(c("cor","pval")) |> as.list() |> data.frame(),
    .by = c(X1, X2)
  ) |>
  tidyr::unnest(cols = c(pea, spe), names_sep = ".") |>
  dplyr::mutate(across(-c(X1, X2), ~round(.x, 4)))

cor_test_comb |> as.data.frame()
X1 X2 pea.cor pea.pval spe.cor spe.pval
1 Qmax Volume 0.7616 0.0000 0.8643 0.0000
2 Qmax Duration 0.1309 0.4679 0.4108 0.0175
3 Volume Duration 0.5545 0.0008 0.7234 0.0000
Partial correlation matrix
dat[,-1] |>
  cov() |>
  solve() |>
  cov2cor() |>
  (`*`)(2*diag(1, ncol(dat[,-1]), ncol(dat[,-1])) - 1) |>
  round(2)
Qmax Volume Duration
Qmax 1.00 0.84 -0.54
Volume 0.84 1.00 0.71
Duration -0.54 0.71 1.00
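For reference, these values follow the usual relation between partial correlations and the precision matrix P = solve(cov(dat[,-1])): the partial correlation of variables i and j given the remaining one equals -P[i,j] / sqrt(P[i,i] * P[j,j]), which is what the sign-flipping matrix in the pipeline above implements.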
dat[-1] |>
  cov() |>
  qgraph::qgraph(edge.labels = TRUE, graph = "pcor",
                 edge.label.cex = 2, edge.label.margin = 0.05, label.cex = 1.3)
3.2 Visual exploration
Observed data and the Pearson correlation coefficient (with the p-value of its significance test) displayed in the corner
cor_test_comb |>
  dplyr::select(X1, X2, pea.cor, pea.pval) |>
  purrr::pwalk(.f = function(X1, X2, pea.cor, pea.pval) {
    p <- ggplot(data = dat) +
      aes(x = .data[[X1]], y = .data[[X2]]) +
      geom_point() +
      geom_text(data = data.frame(),
                aes(label = paste0(round(pea.cor, 2), " (", round(pea.pval, 3), ")")),
                x = Inf, y = -Inf, hjust = 1.5, vjust = -2, size = 3)
    print(p)
  })
Observations transformed by the empirical CDF, with the Spearman correlation coefficient (and the p-value of its significance test) displayed in the corner
cor_test_comb |>
  dplyr::select(X1, X2, spe.cor, spe.pval) |>
  purrr::pwalk(.f = function(X1, X2, spe.cor, spe.pval) {
    p <- ggplot(data = dat_Memp) +
      aes(x = .data[[X1]], y = .data[[X2]]) +
      geom_point() +
      geom_text(data = data.frame(),
                aes(label = paste0(round(spe.cor, 2), " (", round(spe.pval, 3), ")")),
                x = Inf, y = -Inf, hjust = 1.2, vjust = -1.5, size = 3)
    print(p)
  })
Pseudo-observations with standard normal marginals and density level curves (to be compared with the bivariate normal density, which has elliptical contours)
dat_Memp_qnorm <- dat_Memp |>
  dplyr::mutate(across(-index, qnorm))

vars_comb |>
  purrr::pwalk(.f = function(X1, X2) {
    p <- ggplot(data = dat_Memp_qnorm) +
      aes(x = .data[[X1]], y = .data[[X2]]) +
      geom_point() +
      geom_density2d(adjust = c(1.3, 1.3))
    print(p)
  })
3.3 Copula
# dataset in matrix form and without indices
dat_mat <- dat[-1] |> as.matrix()
dat_Memp_mat <- dat_Memp[,-1] |> as.matrix()
3.3.1 Nonparametric
Empirical copula with no smoothing or with a specified smoothing method. See Hofert et al. (2018), Chapter 4.2, for details on the smoothing methods (and their relevance for small samples), and Chapter 6.1 for a discussion of dealing with ties (their effect on estimation).
<- c(emp = "none", chb = "checkerboard", beta = "beta") |>
funs_Cnp lapply(FUN = function(type) {
<- copula::empCopula(dat_Memp_mat, ties.method = "average", smoothing = type)
obj list( # return
obj = obj,
pdf = function(x) copula::dCopula(x, copula = obj),
cdf = function(x) copula::pCopula(x, copula = obj),
ran = function(n) copula::rCopula(n, copula = obj) |>
`colnames<-`)(colnames(dat_Memp_mat))
(# because beta fails to pass colnames (copula version 1.1-2)
)
}
)
# examples:
# funs_Cnp$emp$cdf(rep(0.5,3))
# funs_Cnp$chb$cdf(rep(0.5,3))
# funs_Cnp$beta$cdf(rep(0.5,3))
# results: 0.3636364, 0.3333333, 0.2990709
3.3.2 Elliptical
(Correct name: copulas of elliptically contoured distributions.)
Estimation
<- c(nor = "normal", tco = "t") |>
funs_Cpar_ell lapply(FUN = function(fam) {
<- copula::ellipCopula( # initial constructor
obj
fam, # param = cor(dat_mat) |> (\(x) x[upper.tri(x)])(), # not necessary
dim = ncol(dat_mat),
dispstr = "un"
|>
) ::fitCopula(data = dat_Memp_mat) |> # estimation
copula`@`)("copula") # copula object extraction
(if(fam == "t") { # round df, otherwise pCopula fails
<- length(obj@parameters) # index of df (last parameter)
df_ind @parameters[df_ind] <- obj@parameters[df_ind] |> round() |> max(1)
obj
}list( # return
obj = obj,
pdf = function(x) copula::dCopula(x, copula = obj),
cdf = function(u) copula::pCopula(u, copula = obj),
ran = function(n) copula::rCopula(n, copula = obj) |>
`colnames<-`)(colnames(dat_mat))
(
) })
Warning in var.mpl(copula, u): the covariance matrix of the parameter estimates
is computed as if 'df.fixed = TRUE' with df = 7.11485542432891
# examples:
# funs_Cpar_ell$nor$pdf(c(0.5,0.5,0.5)) # result: 4.268227
# funs_Cpar_ell$tco$cdf(c(0.5,0.5,0.5)) # result: 0.3175686
3.3.3 Vine
Specify the admissible families and particular vine structures via R-vine matrices!
families_vine <- NA # or 0:6 for standard 1-parameter bicopulas
structures_vine <- list(
  vi1 = rbind( # 1st variable as root
    c(2, 0, 0),
    c(3, 3, 0),
    c(1, 1, 1)
  ),
  vi2 = rbind( # 2nd variable as root
    c(1, 0, 0),
    c(3, 3, 0),
    c(2, 2, 2)
  ),
  vi3 = rbind( # 3rd variable as root
    c(2, 0, 0),
    c(1, 1, 0),
    c(3, 3, 3)
  )
)
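A proposed matrix can optionally be validated before fitting; a small sketch assuming the RVineMatrixCheck() helper from VineCopula (a return value of 1 indicates a valid R-vine matrix):
# sapply(structures_vine, VineCopula::RVineMatrixCheck)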
Estimation
# first, define vine objects
funs_Cpar_vine <- structures_vine |>
  lapply(function(rvm) {
    list(obj = VineCopula::RVineCopSelect(
      data = dat_Memp_mat,
      familyset = families_vine,
      Matrix = rvm,
      rotations = TRUE
    ))
  })
# funs_Cpar_vine <- list() # uncomment for automatic structure selection only
funs_Cpar_vine$vin <- list(obj = VineCopula::RVineStructureSelect( # auto
  dat_Memp_mat,
  familyset = families_vine,
  indeptest = TRUE
))
funs_Cpar_vine$vin$obj$Matrix
[,1] [,2] [,3]
[1,] 1 0 0
[2,] 3 2 0
[3,] 2 3 3
# second, make functions (CDF via the empirical copula of simulated data)
funs_Cpar_vine <- funs_Cpar_vine |>
  lapply(function(vine) {
    ran <- function(n) VineCopula::RVineSim(n, RVM = vine$obj)
    simdata <- ran(nrow(dat)*1000)
    list( # return
      obj = vine$obj,
      pdf = function(x) VineCopula::RVinePDF(x, RVM = vine$obj),
      cdf = function(u) copula::C.n(u, X = simdata, smoothing = "none"),
      ran = ran
    )
  })
# examples:
# funs_Cpar_vine$vin$pdf(c(0.5,0.5,0.5)) # result: 5.367162
# funs_Cpar_vine$vin$cdf(c(0.5,0.5,0.5)) # result: 0.319...
# funs_Cpar_vine$vin$ran(2)
# result (random):
# Qmax Volume Duration
# [1,] 0.6301589 0.5407897 0.2184795
# [2,] 0.4032000 0.2401318 0.1016480
# sapply(funs_Cpar_vine, function(x) x$pdf(c(0.5,0.5,0.5)))
# result:
# vi1 vi2 vi3 vin
# 5.396515 5.367162 4.410513 5.367162
# obviously the 2nd structure is also the choice of automatic selection
3.3.4 GoF comparison
Gather the models into a single list
funs_Call <- c(funs_Cnp[c("chb", "beta")],
               funs_Cpar_ell,
               funs_Cpar_vine)
3.3.4.1 Numerical
Goodness-of-fit via MSE to empirical copula
local({
  cemp <- copula::F.n(x = dat_Memp_mat, X = dat_Memp_mat)
  sapply(funs_Call, function(cpar) {
    mean((cemp - cpar$cdf(dat_Memp_mat))^2)
  })
}) |>
  signif(3) |>
  sort()
chb vi2 tco vi1 vi3 vin nor beta
0.000810 0.000859 0.000861 0.000869 0.000907 0.000914 0.001040 0.001500
The performance of the checkerboard copula should be interpreted with caution, because it is derived from the empirical copula, which is also the benchmark it is compared against.
3.3.4.2 Visual
Kendall’s distribution function. The lower straight black line corresponds to the co-monotonicity copula and the upper black curve to the independence copula. The step function shows the empirical version.
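For reference, both black curves are analytic: K(z) = z for the co-monotonicity copula, while for the trivariate independence copula K(z) = z * (1 - log(z) + log(z)^2/2), which is exactly the curve drawn by the second geom_line() call in the code below.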
funs_Call |>
  lapply(function(cop) {
    sim <- cop$ran(1000)
    data.frame(z = copula::F.n(sim, sim)) |>
      dplyr::mutate(cdf = rank(z)/length(z))
  }) |>
  dplyr::bind_rows(.id = "copula") |>
  dplyr::mutate(copula = forcats::as_factor(copula)) |>
  dplyr::arrange(copula, z) |>
  ggplot() + aes(x = z, y = cdf) +
  geom_step(data = dplyr::mutate(data.frame(z = funs_Cnp$emp$cdf(dat_Memp_mat)),
                                 cdf = rank(z)/length(z))) +
  geom_line(aes(color = copula)) +
  geom_line(data = dplyr::mutate(data.frame(z = seq(0.01, 1, by = 0.01)),
                                 cdf = z * (1 - log(z) + log(z)^2/2))) +
  geom_line(data = data.frame(z = c(0, 1), cdf = c(0, 1)))
3.3.5 Selection
Choose a copula from the fitted ones!
copu_id <- "nor"
copu_fu <- funs_Call[[copu_id]]
Pairwise density contour plots of simulations from the distribution with the selected copula and standard normal margins.
sim_Csel_qnorm <- copu_fu$ran(10000) |>
  data.frame() |>
  dplyr::mutate(across(everything(), qnorm))

vars_comb |>
  purrr::pwalk(.f = function(X1, X2) {
    p <- ggplot(data = sim_Csel_qnorm) +
      aes(x = .data[[X1]], y = .data[[X2]]) +
      geom_point(col = "grey") +
      geom_point(data = dat_Memp_qnorm) +
      geom_density2d(adjust = c(1.5, 1.5))
    print(p)
  })
Class-specific visuals
if(stringr::str_starts(copu_id, "vi")) { # vine copulas
  contour(copu_fu$obj)
  set.seed(2)
  plot(copu_fu$obj, type = 1, edge.labels = "family-tau", label.pad = 0.1)
  names(dat)[-1] |> setNames(1:3)
}
4 Joint distribution
Model functions
join_fu <- list(
  cdf = function(q) {
    q <- as.data.frame(q)
    u <- mapply(function(marg, vec) marg$cdf(vec), marg_fu, q) # C(F1(x1),...)
    copu_fu$cdf(u)
  },
  pdf = function(x) {
    x <- as.data.frame(x)
    u <- mapply(function(marg, vec) marg$cdf(vec), marg_fu, x) |> rbind()
    pdfx <- mapply(function(marg, vec) marg$pdf(vec), marg_fu, x) |> rbind()
    copu_fu$pdf(u) * apply(pdfx, 1, prod) # c(F1(x1),...)*f1(x1)*f2(x2)*f3(x3)
  },
  ran = function(n) {
    sim <- copu_fu$ran(n) # simulate from the copula, then transform to quantiles
    mapply(function(marg, vec) marg$qua(vec), marg_fu, as.data.frame(sim))
  }
)
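A usage sketch (illustration only; the numeric values are hypothetical but lie within the observed ranges, column order Qmax, Volume, Duration):
# join_fu$cdf(data.frame(Qmax = 1, Volume = 0.2, Duration = 5)) # joint non-exceedance probability
# join_fu$pdf(data.frame(Qmax = 1, Volume = 0.2, Duration = 5)) # joint density
# sim <- join_fu$ran(1000) # 1000 synthetic events on the original scale
# pairs(sim)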
—— UNDER CONSTRUCTION ——