Standard Element 3

Model training, validation, and interpretation

Author

Affiliation

Pietari Pöykkö, MSc (tech)

University of Oulu

Published

2026-05-12

Keywords

drought indices, SGI, drought prediction

This notebook contains the “modelling” part of the project. Here the relationships between the SGI and drought indices are examined. This file also produces some summaries of the performance of various parameters.

Create temporally lagged versions of all the drought indices (-6 to +12 months). These are needed for the “modelling” part.

lags <- -6:12 # Range of lags to examine (months)
lag_cols <- paste0("value_lag_",lags) # Col names for lagged values

# Order by time before shifting (grouped by station_id/paikka_id and drought index)
setorder(station_sgi_w_dr_inds, station_id, dr_ind, time)
setorder(pipe_sgi_w_dr_inds, paikka_id, dr_ind, time)

#  Create the multiple shifted columns
temp_st_wide_lag_inds <- data.table::copy(station_sgi_w_dr_inds) # do not modif orig
temp_st_wide_lag_inds[, (lag_cols) := shift(value, lags), by=list(station_id, dr_ind)]

temp_p_wide_lag_inds <- data.table::copy(pipe_sgi_w_dr_inds) # do not modif orig
temp_p_wide_lag_inds[, (lag_cols) := shift(value, lags), by=list(station_id, dr_ind)]

# Shifted cols into long format. Kept cols are in `id.vars`
station_lag_inds <- melt(temp_st_wide_lag_inds,
  id.vars = c("station_id", "time", "dr_ind","ind_type","ind_accum", "value", "sgi"), 
  measure.vars = lag_cols,
  variable.name = "lag_months", value.name = "lagged_value"
)
pipe_lag_inds <- melt(temp_p_wide_lag_inds,
  id.vars = c("paikka_id","station_id", "time", "dr_ind","ind_type","ind_accum", "value", "sgi"), 
  measure.vars = lag_cols,
  variable.name = "lag_months", value.name = "lagged_value"
)
rm(temp_st_wide_lag_inds, temp_p_wide_lag_inds) # Cleanup temp vars
# Convert `lag_months` to contain only the numeric lag amount
station_lag_inds[, lag_months := as.integer(gsub("value_lag_", "", lag_months))]
pipe_lag_inds[,    lag_months := as.integer(gsub("value_lag_", "", lag_months))]

Calculate the Pearson’s correlation coefficients between all the drought indices and the SGI time series. This is done on both the station and the pipe level. The best-correlating combinations of drought index accumulation period and temporal lag are also saved.

# Function to re-format cor.test output into 1-long lists (output will stay as 1 row when applying in dt)
dt_cor.test <- function(...) {
  res_list <- as.list(stats::cor.test(...))
  res_list[["conf.int.lo"]] <- res_list[["conf.int"]][[1]]
  res_list[["conf.int.hi"]] <- res_list[["conf.int"]][[2]]
  res_list[["conf.int"]] <- NULL;  stopifnot(lapply(res_list, length) == 1L)
  return(res_list)
}
# Calculate correlations on pipe and station level
pipe_cor <- pipe_lag_inds[, dt_cor.test(sgi, lagged_value, method="pear"),
                                by=list(paikka_id,station_id, dr_ind,ind_type,ind_accum, lag_months)]
station_cor <- station_lag_inds[, dt_cor.test(sgi, lagged_value, method="pear"),
                                by=list(station_id, dr_ind,ind_type,ind_accum, lag_months)]

# SGI-droughtindex correlation plots
pipe_cor |> 
  ggplot(aes(ind_type, estimate)) +
      geom_violin() +
      ggforce::geom_sina(aes(alpha=p.value <= 0.01, group=dr_ind), show.legend=F) +
      facet_grid(rows=vars(ind_accum), cols=vars(lag_months), scales="free_x") +
      stat_summary(fun.data = "mean_se", color="red")

Warning: Using alpha for a discrete variable is not advised.

station_cor |> 
  ggplot(aes(ind_type, estimate, group=interaction(ind_type,ind_accum))) +
      geom_violin() +
      ggforce::geom_sina(aes(alpha=p.value <= 0.01, group=dr_ind)) +
      facet_grid(cols = vars(ind_accum), scales="free_x") +
      stat_summary(fun.data = "mean_se", color="red")

Warning: Using alpha for a discrete variable is not advised.

      # geom_point(aes(alpha=p.value <= 0.01))


# Summarize all stations: what lag + accum combo is the best for each station & index
station_cor_smry <- station_cor[, .SD[which.max(estimate)],
                                    by=list(ind_type, station_id)]
station_cor_smry[,stopifnot(!anyDuplicated(station_id)), by=ind_type] # validate

Empty data.table (0 rows and 1 cols): ind_type

# Summarize all pipes
pipe_cor_smry <- pipe_cor[, .SD[which.max(estimate)],
                                    by=list(ind_type, paikka_id,station_id)]
pipe_cor_smry[,stopifnot(!anyDuplicated(paikka_id)), by=ind_type] # validate

Empty data.table (0 rows and 1 cols): ind_type

# Plot the correlations heatmap of 1 station
ggplot(station_cor[station_id=="0101"],
    aes(lag_months, factor(ind_accum), fill=estimate)) +
  geom_tile() +
  geom_vline(xintercept=0, color="black") + 
  geom_point(data=station_cor_smry[station_id=="0101"], color="white", size=3, pch="cross", stroke=1) +
  facet_grid(rows=vars(ind_type), cols=vars(station_id)) +
  scico::scale_fill_scico("Correlation", palette="vik", midpoint=0) +
  coord_cartesian(expand=F)

# and pipes of 1 stations
ggplot(pipe_cor[station_id=="0902"],
    aes(lag_months, factor(ind_accum), fill=estimate)) +
  geom_tile() +
  geom_vline(xintercept=0, color="black") + 
  stat_summary(data=pipe_cor_smry[station_id=="0902"], fun=max, color="white") +
  facet_grid(rows=vars(ind_type), cols=vars(paikka_id)) +
  scico::scale_fill_scico("Correlation", palette="vik", midpoint=0) +
  coord_cartesian(expand=F)

Warning: Removed 1 row containing missing values or values outside the scale range
(`geom_segment()`).

Warning: Removed 1 row containing missing values or values outside the scale range
(`geom_segment()`).
Removed 1 row containing missing values or values outside the scale range
(`geom_segment()`).
Removed 1 row containing missing values or values outside the scale range
(`geom_segment()`).
Removed 1 row containing missing values or values outside the scale range
(`geom_segment()`).
Removed 1 row containing missing values or values outside the scale range
(`geom_segment()`).

# Plot summary of all stations
dplyr::group_by(station_cor_smry, ind_type) |> 
  dplyr::count(lag_months, ind_accum) |> 
    ggplot(aes(lag_months, ind_accum)) +
    geom_tile(aes(fill=n)) + 
    scale_fill_viridis_c("Stations", transform="sqrt") +
    geom_label(aes(label=n), alpha=0.5) +
    facet_grid(rows=vars(ind_type)) +
    scale_x_continuous(breaks=~round(unique(pretty(.)))) +
    theme_bw()

# Compare also which index is better overall (plot best correlations for all indices)
ggplot(station_cor_smry, aes(ind_type, estimate)) + 
  geom_violin(draw_quantiles = c(0.5), lwd=0.8) + ggforce::geom_sina(alpha=0.2) + theme_minimal()

# Plot summary of all pipes
dplyr::group_by(pipe_cor_smry, ind_type) |> 
  dplyr::count(lag_months, ind_accum) |> 
    ggplot(aes(lag_months, ind_accum)) +
    geom_tile(aes(fill=n)) + 
    scale_fill_viridis_c("Pipes", transform="sqrt") +
    geom_label(aes(label=n), alpha=0.5) +
    facet_grid(rows=vars(ind_type)) +
    scale_x_continuous(breaks=~round(unique(pretty(.)))) +
    theme_bw()

ggplot(pipe_cor_smry, aes(ind_type, estimate)) + 
  geom_violin(draw_quantiles=c(0.5), lwd=0.8) + ggforce::geom_sina(alpha=0.2) + theme_minimal()

# For some reasons these options give confusing results...
# heatmap(table(station_cor_smry[,.(lag_months,ind_accum)]))
# plot(table(station_cor_smry[,.(lag_months,ind_accum)]))
# Alt to `table`: xtabs(~ lag_months + ind_accum, data = station_cor_smry)

Calculate the RMSE between all the drought indices and the SGI time series. This is done on both the station and the pipe level. Do a similar summarization of the best combinations.

pipe_rmse <- pipe_lag_inds[, .(RMSE=DescTools::RMSE(lagged_value, sgi, na.rm=T)), # SGI is ref
    by=list(paikka_id,station_id, dr_ind,ind_type,ind_accum, lag_months)]
# Plot RMSEs of all combinations
pipe_rmse |> 
  ggplot(aes(ind_type, RMSE, group=interaction(ind_type,ind_accum))) +
    geom_violin() +
    ggforce::geom_sina(aes(group=dr_ind)) +
    facet_grid(cols = vars(ind_accum), scales="free_x") +
    stat_summary(fun.data = "mean_se", color="red")

# Summarize RMSE of pipes (which lag + accum combo gives lowest value, for each pipe and index)
pipe_rmse_smry <- pipe_rmse[, .SD[which.min(RMSE)],
                                    by=list(ind_type, paikka_id)]# Plot RMSE development on 1 station

ggplot(pipe_rmse[station_id=="0902"],
    aes(lag_months, factor(ind_accum), fill=RMSE)) +
  geom_tile() +
  geom_vline(xintercept=0, color="black") + 
  geom_point(data=pipe_rmse_smry[station_id=="0902"], color="white", size=3, pch="cross", stroke=1) +
  facet_grid(rows=vars(ind_type), cols=vars(paikka_id)) +
  scico::scale_fill_scico("RMSE", palette="batlow") +
  coord_cartesian(expand=F)

# Plot summary of the best RMSEs of pipes
dplyr::group_by(pipe_rmse_smry, ind_type) |> 
  dplyr::count(lag_months, ind_accum) |> 
    ggplot(aes(lag_months, ind_accum)) +
    geom_tile(aes(fill=n)) + 
    scale_fill_viridis_c("Pipes", transform="sqrt") +
    geom_label(aes(label=n), alpha=0.5) +
    facet_grid(rows=vars(ind_type)) +
    scale_x_continuous(breaks=~round(unique(pretty(.)))) +
    theme_bw()

1 Auto- and cross-correlations

Look into how the SGI is autocorrelated, and how the SGI cross-correlates with other indices.

acf(pipe_sgi_long[paikka_id == 10550, sgi], na.action = na.pass)

pipe_sgi_long[paikka_id %in% unique(paikka_id)[1:3]][, print(acf(sgi, na.action = na.pass)), by=paikka_id]


Autocorrelations of series 'sgi', by lag

    0     1     2     3     4     5     6     7     8     9    10    11    12 
1.000 0.625 0.401 0.279 0.253 0.206 0.160 0.146 0.150 0.157 0.167 0.156 0.174 
   13    14    15    16    17    18    19    20    21    22    23    24    25 
0.125 0.077 0.066 0.089 0.099 0.119 0.127 0.095 0.055 0.116 0.169 0.186 0.091 
   26    27    28 
0.019 0.021 0.034


Autocorrelations of series 'sgi', by lag

    0     1     2     3     4     5     6     7     8     9    10    11    12 
1.000 0.785 0.649 0.551 0.491 0.503 0.493 0.460 0.463 0.451 0.409 0.375 0.358 
   13    14    15    16    17    18    19    20    21    22    23    24    25 
0.329 0.337 0.319 0.307 0.295 0.286 0.252 0.250 0.211 0.190 0.182 0.171 0.169 
   26    27    28 
0.155 0.109 0.127


Autocorrelations of series 'sgi', by lag

     0      1      2      3      4      5      6      7      8      9     10 
 1.000  0.616  0.409  0.280  0.227  0.198  0.159  0.116  0.120  0.110  0.083 
    11     12     13     14     15     16     17     18     19     20     21 
 0.047  0.049 -0.039 -0.032 -0.024 -0.031 -0.013 -0.008 -0.019 -0.009 -0.053 
    22     23     24     25     26     27     28 
-0.081 -0.103 -0.059 -0.072 -0.032 -0.066 -0.059

Empty data.table (0 rows and 1 cols): paikka_id

ccf_test_spei <- station_spi_spei_ts[station_id == pipes[paikka_id == 10550, asema_tunnus] &
  dr_ind == "SPEI12"
]
ccf_test_pipe <- pipe_sgi_long[paikka_id == 10550]
ccf_test <- merge(ccf_test_spei, ccf_test_pipe, by="time")

ccf(ccf_test$sgi, ccf_test$value, na.action = na.pass, lag.max=36)

stats::cor.test(ccf_test$sgi, ccf_test$value,  method="pearson")


    Pearson's product-moment correlation

data:  ccf_test$sgi and ccf_test$value
t = 23.261, df = 594, p-value < 2.2e-16
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
 0.6459293 0.7302459
sample estimates:
      cor 
0.6904253

plot(ccf_test$sgi, ccf_test$value)

2 TODO

How much does the correlation worsen when looking further into the future?
calculate mean unsaturated zone thickness, look at the results based on that
investigate if there is a spatial pattern to the differing responses to optimal lag+accumulation combination
investigate how the absolute GW level fluctuation amplitude is tied to the SGI response
use ImeytymisKerroin of GWAs of stations to look at the results?
investigate the ranges at which the SGI ACF decays (m_max) and how that impacts the results
- look at how the ACF is tied to the optimal (highest SGI cor.) P accumulation of the sites
calculate max & median drought durations from both SGI and the indices.
- look at how they are cross-correlated
- look at which of them show comparatively more drought periods over the whole data
- look at how they are tied to the ACF decay, and to the unsat.zone thickness
calculate correlations of SGI and dr-indices only with <0 values. (seems to have worse corr.)
- also look at whether the unsat.zone thickness explains a site’s discrepency between >0 and <0 correlations