lags <- -6:12 # Range of lags to examine (months)
lag_cols <- paste0("value_lag_",lags) # Col names for lagged values
# Order by time before shifting (grouped by station_id/paikka_id and drought index)
setorder(station_sgi_w_dr_inds, station_id, dr_ind, time)
setorder(pipe_sgi_w_dr_inds, paikka_id, dr_ind, time)
# Create the multiple shifted columns
temp_st_wide_lag_inds <- data.table::copy(station_sgi_w_dr_inds) # do not modif orig
temp_st_wide_lag_inds[, (lag_cols) := shift(value, lags), by=list(station_id, dr_ind)]
temp_p_wide_lag_inds <- data.table::copy(pipe_sgi_w_dr_inds) # do not modif orig
temp_p_wide_lag_inds[, (lag_cols) := shift(value, lags), by=list(station_id, dr_ind)]
# Shifted cols into long format. Kept cols are in `id.vars`
station_lag_inds <- melt(temp_st_wide_lag_inds,
id.vars = c("station_id", "time", "dr_ind","ind_type","ind_accum", "value", "sgi"),
measure.vars = lag_cols,
variable.name = "lag_months", value.name = "lagged_value"
)
pipe_lag_inds <- melt(temp_p_wide_lag_inds,
id.vars = c("paikka_id","station_id", "time", "dr_ind","ind_type","ind_accum", "value", "sgi"),
measure.vars = lag_cols,
variable.name = "lag_months", value.name = "lagged_value"
)
rm(temp_st_wide_lag_inds, temp_p_wide_lag_inds) # Cleanup temp vars
# Convert `lag_months` to contain only the numeric lag amount
station_lag_inds[, lag_months := as.integer(gsub("value_lag_", "", lag_months))]
pipe_lag_inds[, lag_months := as.integer(gsub("value_lag_", "", lag_months))]Standard Element 3
Model training, validation, and interpretation
drought indices, SGI, drought prediction
This notebook contains the “modelling” part of the project. Here the relationships between the SGI and drought indices are examined. This file also produces some summaries of the performance of various parameters.
Create temporally lagged versions of all the drought indices (-6 to +12 months). These are needed for the “modelling” part.
Calculate the Pearson’s correlation coefficients between all the drought indices and the SGI time series. This is done on both the station and the pipe level. The best-correlating combinations of drought index accumulation period and temporal lag are also saved.
# Function to re-format cor.test output into 1-long lists (output will stay as 1 row when applying in dt)
dt_cor.test <- function(...) {
res_list <- as.list(stats::cor.test(...))
res_list[["conf.int.lo"]] <- res_list[["conf.int"]][[1]]
res_list[["conf.int.hi"]] <- res_list[["conf.int"]][[2]]
res_list[["conf.int"]] <- NULL; stopifnot(lapply(res_list, length) == 1L)
return(res_list)
}
# Calculate correlations on pipe and station level
pipe_cor <- pipe_lag_inds[, dt_cor.test(sgi, lagged_value, method="pear"),
by=list(paikka_id,station_id, dr_ind,ind_type,ind_accum, lag_months)]
station_cor <- station_lag_inds[, dt_cor.test(sgi, lagged_value, method="pear"),
by=list(station_id, dr_ind,ind_type,ind_accum, lag_months)]
# SGI-droughtindex correlation plots
pipe_cor |>
ggplot(aes(ind_type, estimate)) +
geom_violin() +
ggforce::geom_sina(aes(alpha=p.value <= 0.01, group=dr_ind), show.legend=F) +
facet_grid(rows=vars(ind_accum), cols=vars(lag_months), scales="free_x") +
stat_summary(fun.data = "mean_se", color="red")Warning: Using alpha for a discrete variable is not advised.
station_cor |>
ggplot(aes(ind_type, estimate, group=interaction(ind_type,ind_accum))) +
geom_violin() +
ggforce::geom_sina(aes(alpha=p.value <= 0.01, group=dr_ind)) +
facet_grid(cols = vars(ind_accum), scales="free_x") +
stat_summary(fun.data = "mean_se", color="red")Warning: Using alpha for a discrete variable is not advised.
# geom_point(aes(alpha=p.value <= 0.01))
# Summarize all stations: what lag + accum combo is the best for each station & index
station_cor_smry <- station_cor[, .SD[which.max(estimate)],
by=list(ind_type, station_id)]
station_cor_smry[,stopifnot(!anyDuplicated(station_id)), by=ind_type] # validateEmpty data.table (0 rows and 1 cols): ind_type
# Summarize all pipes
pipe_cor_smry <- pipe_cor[, .SD[which.max(estimate)],
by=list(ind_type, paikka_id,station_id)]
pipe_cor_smry[,stopifnot(!anyDuplicated(paikka_id)), by=ind_type] # validateEmpty data.table (0 rows and 1 cols): ind_type
# Plot the correlations heatmap of 1 station
ggplot(station_cor[station_id=="0101"],
aes(lag_months, factor(ind_accum), fill=estimate)) +
geom_tile() +
geom_vline(xintercept=0, color="black") +
geom_point(data=station_cor_smry[station_id=="0101"], color="white", size=3, pch="cross", stroke=1) +
facet_grid(rows=vars(ind_type), cols=vars(station_id)) +
scico::scale_fill_scico("Correlation", palette="vik", midpoint=0) +
coord_cartesian(expand=F)# and pipes of 1 stations
ggplot(pipe_cor[station_id=="0902"],
aes(lag_months, factor(ind_accum), fill=estimate)) +
geom_tile() +
geom_vline(xintercept=0, color="black") +
stat_summary(data=pipe_cor_smry[station_id=="0902"], fun=max, color="white") +
facet_grid(rows=vars(ind_type), cols=vars(paikka_id)) +
scico::scale_fill_scico("Correlation", palette="vik", midpoint=0) +
coord_cartesian(expand=F)Warning: Removed 1 row containing missing values or values outside the scale range
(`geom_segment()`).
Warning: Removed 1 row containing missing values or values outside the scale range
(`geom_segment()`).
Removed 1 row containing missing values or values outside the scale range
(`geom_segment()`).
Removed 1 row containing missing values or values outside the scale range
(`geom_segment()`).
Removed 1 row containing missing values or values outside the scale range
(`geom_segment()`).
Removed 1 row containing missing values or values outside the scale range
(`geom_segment()`).
# Plot summary of all stations
dplyr::group_by(station_cor_smry, ind_type) |>
dplyr::count(lag_months, ind_accum) |>
ggplot(aes(lag_months, ind_accum)) +
geom_tile(aes(fill=n)) +
scale_fill_viridis_c("Stations", transform="sqrt") +
geom_label(aes(label=n), alpha=0.5) +
facet_grid(rows=vars(ind_type)) +
scale_x_continuous(breaks=~round(unique(pretty(.)))) +
theme_bw()# Compare also which index is better overall (plot best correlations for all indices)
ggplot(station_cor_smry, aes(ind_type, estimate)) +
geom_violin(draw_quantiles = c(0.5), lwd=0.8) + ggforce::geom_sina(alpha=0.2) + theme_minimal()# Plot summary of all pipes
dplyr::group_by(pipe_cor_smry, ind_type) |>
dplyr::count(lag_months, ind_accum) |>
ggplot(aes(lag_months, ind_accum)) +
geom_tile(aes(fill=n)) +
scale_fill_viridis_c("Pipes", transform="sqrt") +
geom_label(aes(label=n), alpha=0.5) +
facet_grid(rows=vars(ind_type)) +
scale_x_continuous(breaks=~round(unique(pretty(.)))) +
theme_bw()ggplot(pipe_cor_smry, aes(ind_type, estimate)) +
geom_violin(draw_quantiles=c(0.5), lwd=0.8) + ggforce::geom_sina(alpha=0.2) + theme_minimal()# For some reasons these options give confusing results...
# heatmap(table(station_cor_smry[,.(lag_months,ind_accum)]))
# plot(table(station_cor_smry[,.(lag_months,ind_accum)]))
# Alt to `table`: xtabs(~ lag_months + ind_accum, data = station_cor_smry)Calculate the RMSE between all the drought indices and the SGI time series. This is done on both the station and the pipe level. Do a similar summarization of the best combinations.
pipe_rmse <- pipe_lag_inds[, .(RMSE=DescTools::RMSE(lagged_value, sgi, na.rm=T)), # SGI is ref
by=list(paikka_id,station_id, dr_ind,ind_type,ind_accum, lag_months)]
# Plot RMSEs of all combinations
pipe_rmse |>
ggplot(aes(ind_type, RMSE, group=interaction(ind_type,ind_accum))) +
geom_violin() +
ggforce::geom_sina(aes(group=dr_ind)) +
facet_grid(cols = vars(ind_accum), scales="free_x") +
stat_summary(fun.data = "mean_se", color="red")# Summarize RMSE of pipes (which lag + accum combo gives lowest value, for each pipe and index)
pipe_rmse_smry <- pipe_rmse[, .SD[which.min(RMSE)],
by=list(ind_type, paikka_id)]# Plot RMSE development on 1 station
ggplot(pipe_rmse[station_id=="0902"],
aes(lag_months, factor(ind_accum), fill=RMSE)) +
geom_tile() +
geom_vline(xintercept=0, color="black") +
geom_point(data=pipe_rmse_smry[station_id=="0902"], color="white", size=3, pch="cross", stroke=1) +
facet_grid(rows=vars(ind_type), cols=vars(paikka_id)) +
scico::scale_fill_scico("RMSE", palette="batlow") +
coord_cartesian(expand=F)# Plot summary of the best RMSEs of pipes
dplyr::group_by(pipe_rmse_smry, ind_type) |>
dplyr::count(lag_months, ind_accum) |>
ggplot(aes(lag_months, ind_accum)) +
geom_tile(aes(fill=n)) +
scale_fill_viridis_c("Pipes", transform="sqrt") +
geom_label(aes(label=n), alpha=0.5) +
facet_grid(rows=vars(ind_type)) +
scale_x_continuous(breaks=~round(unique(pretty(.)))) +
theme_bw()1 Auto- and cross-correlations
Look into how the SGI is autocorrelated, and how the SGI cross-correlates with other indices.
acf(pipe_sgi_long[paikka_id == 10550, sgi], na.action = na.pass)pipe_sgi_long[paikka_id %in% unique(paikka_id)[1:3]][, print(acf(sgi, na.action = na.pass)), by=paikka_id]
Autocorrelations of series 'sgi', by lag
0 1 2 3 4 5 6 7 8 9 10 11 12
1.000 0.625 0.401 0.279 0.253 0.206 0.160 0.146 0.150 0.157 0.167 0.156 0.174
13 14 15 16 17 18 19 20 21 22 23 24 25
0.125 0.077 0.066 0.089 0.099 0.119 0.127 0.095 0.055 0.116 0.169 0.186 0.091
26 27 28
0.019 0.021 0.034
Autocorrelations of series 'sgi', by lag
0 1 2 3 4 5 6 7 8 9 10 11 12
1.000 0.785 0.649 0.551 0.491 0.503 0.493 0.460 0.463 0.451 0.409 0.375 0.358
13 14 15 16 17 18 19 20 21 22 23 24 25
0.329 0.337 0.319 0.307 0.295 0.286 0.252 0.250 0.211 0.190 0.182 0.171 0.169
26 27 28
0.155 0.109 0.127
Autocorrelations of series 'sgi', by lag
0 1 2 3 4 5 6 7 8 9 10
1.000 0.616 0.409 0.280 0.227 0.198 0.159 0.116 0.120 0.110 0.083
11 12 13 14 15 16 17 18 19 20 21
0.047 0.049 -0.039 -0.032 -0.024 -0.031 -0.013 -0.008 -0.019 -0.009 -0.053
22 23 24 25 26 27 28
-0.081 -0.103 -0.059 -0.072 -0.032 -0.066 -0.059
Empty data.table (0 rows and 1 cols): paikka_id
ccf_test_spei <- station_spi_spei_ts[station_id == pipes[paikka_id == 10550, asema_tunnus] &
dr_ind == "SPEI12"
]
ccf_test_pipe <- pipe_sgi_long[paikka_id == 10550]
ccf_test <- merge(ccf_test_spei, ccf_test_pipe, by="time")
ccf(ccf_test$sgi, ccf_test$value, na.action = na.pass, lag.max=36)stats::cor.test(ccf_test$sgi, ccf_test$value, method="pearson")
Pearson's product-moment correlation
data: ccf_test$sgi and ccf_test$value
t = 23.261, df = 594, p-value < 2.2e-16
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
0.6459293 0.7302459
sample estimates:
cor
0.6904253
plot(ccf_test$sgi, ccf_test$value)2 TODO
- How much does the correlation worsen when looking further into the future?
- calculate mean unsaturated zone thickness, look at the results based on that
- investigate if there is a spatial pattern to the differing responses to optimal lag+accumulation combination
- investigate how the absolute GW level fluctuation amplitude is tied to the SGI response
- use ImeytymisKerroin of GWAs of stations to look at the results?
- investigate the ranges at which the SGI ACF decays (m_max) and how that impacts the results
- look at how the ACF is tied to the optimal (highest SGI cor.) P accumulation of the sites
- calculate max & median drought durations from both SGI and the indices.
- look at how they are cross-correlated
- look at which of them show comparatively more drought periods over the whole data
- look at how they are tied to the ACF decay, and to the unsat.zone thickness
- calculate correlations of SGI and dr-indices only with <0 values. (seems to have worse corr.)
- also look at whether the unsat.zone thickness explains a site’s discrepency between >0 and <0 correlations