Skip to content

Commit

Permalink
share of missing weights option to nuts_convert and `nuts_aggregate…
Browse files Browse the repository at this point in the history
…`, elaborate internatl data in vignette
  • Loading branch information
AAoritz committed Mar 12, 2024
1 parent d81c99c commit 5af7b8f
Show file tree
Hide file tree
Showing 11 changed files with 177 additions and 18 deletions.
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ importFrom(dplyr,if_any)
importFrom(dplyr,inner_join)
importFrom(dplyr,join_by)
importFrom(dplyr,left_join)
importFrom(dplyr,matches)
importFrom(dplyr,mutate)
importFrom(dplyr,mutate_at)
importFrom(dplyr,n)
Expand Down
1 change: 1 addition & 0 deletions R/nuts-package.R
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
#' @importFrom dplyr relocate
#' @importFrom dplyr rename
#' @importFrom dplyr select
#' @importFrom dplyr matches
#' @importFrom dplyr slice
#' @importFrom dplyr summarize
#' @importFrom dplyr summarise
Expand Down
23 changes: 23 additions & 0 deletions R/nuts_aggregate.R
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
#' @param weight String with name of the weight used for conversion. Can be area size `'areaKm'` (default),
#' population in 2011 `'pop11'` or 2018 `'pop18'`, or artificial surfaces in 2012 `'artif_surf12'` and 2018 `'artif_surf18'`.
#' @param missing_rm Boolean that is FALSE by default. TRUE removes regional flows that depart from missing NUTS codes.
#' @param missing_weights_pct Boolean that is FALSE by default. TRUE computes the percentage of missing weights due to missing departing NUTS regions for each variable.
#' @param multiple_versions By default equal to `'error'`, when providing multiple NUTS versions within groups.
#' If set to `'most_frequent'` data is converted using the best-matching NUTS version.
#'
Expand Down Expand Up @@ -41,6 +42,7 @@ nuts_aggregate <- function(data,
variables,
weight = NULL,
missing_rm = FALSE,
missing_weights_pct = FALSE,
multiple_versions = c("error", "most_frequent")) {

# DEFINE CLI DIVs
Expand Down Expand Up @@ -220,8 +222,29 @@ nuts_aggregate <- function(data,
summarise(across(rel_vars, ~ {sum(.x * .data$w, na.rm = missing_rm) / sum(.data$w)})) %>%
ungroup()

# - Compute share of missing weights for each variable
if(missing_weights_pct){
missing_weights_data <- data %>%
mutate(across(c(abs_vars, rel_vars), ~ifelse(is.na(.x), NA_real_, .data$w), .names = "{col}_na_w")) %>%
group_by(pick(c(
"to_code", group_vars
))) %>%
summarise(across(matches("w"), ~ sum(.x, na.rm = TRUE))) %>%
ungroup() %>%
# Compute share of missing weights
mutate(across(matches("_na_w"), ~ 100 - .x / w * 100)) %>%
select(-.data$w)
}

# - Overwrite data with absolute and relative variables at the desired level x group
data <- abs_data %>%
full_join(rel_data, by = c("to_code", group_vars))

# - Add percentage of missing weights
if(missing_weights_pct){
data <- data %>%
full_join(missing_weights_data, by = c("to_code", group_vars))
}
# - Done

# Console Message
Expand Down
57 changes: 54 additions & 3 deletions R/nuts_convert_version.R
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
#' @param weight String with name of the weight used for conversion. Can be area size `'areaKm'` (default),
#' population in 2011 `'pop11'` or 2018 `'pop18'`, or artificial surfaces in 2012 `'artif_surf12'` and 2018 `'artif_surf18'`.
#' @param missing_rm Boolean that is FALSE by default. TRUE removes regional flows that depart from missing NUTS codes.
#' @param missing_weights_pct Boolean that is FALSE by default. TRUE computes the percentage of missing weights due to missing departing NUTS regions for each variable.
#' @param multiple_versions By default equal to `'error'`, when providing multiple NUTS versions within groups.
#' If set to `'most_frequent'` data is converted using the best-matching NUTS version.
#'
Expand Down Expand Up @@ -58,6 +59,7 @@ nuts_convert_version <-
variables,
weight = NULL,
missing_rm = FALSE,
missing_weights_pct = FALSE,
multiple_versions = c("error", "most_frequent")) {

# DEFINE CLI DIVs
Expand Down Expand Up @@ -214,12 +216,13 @@ nuts_convert_version <-
group_by(pick(c(
"from_code", group_vars
))) %>%
mutate(w = .data$w / sum(.data$w)) %>%
mutate(w_sh = .data$w / sum(.data$w)) %>%
ungroup() %>%
select(-.data$w) %>%
group_by(pick(c(
"to_code", group_vars
))) %>%
summarise(across(abs_vars, ~sum(.x * .data$w, na.rm = missing_rm))) %>%
summarise(across(abs_vars, ~sum(.x * .data$w_sh, na.rm = missing_rm))) %>%
ungroup() %>%
# - Add version
mutate(to_version = to_version) %>%
Expand All @@ -238,10 +241,58 @@ nuts_convert_version <-
mutate(to_version = to_version) %>%
relocate(c("to_code", "to_version"))


# - Compute share of missing weights for each variable
if(missing_weights_pct){
# - For absolute variables
abs_missing_weights <- data %>%
select(-all_of(rel_vars)) %>%
group_by(pick(c(
"from_code", group_vars
))) %>%
mutate(w_sh = .data$w / sum(.data$w)) %>%
ungroup() %>%
select(-.data$w) %>%
# - Create weight for every variable that is missing when variable is missing
mutate(across(abs_vars, ~ifelse(is.na(.x), NA_real_, .data$w_sh), .names = "{col}_na_w")) %>%
group_by(pick(c(
"to_code", group_vars
))) %>%
# - Summarize across weights
summarise(across(matches("w"), sum, na.rm = TRUE)) %>%
ungroup() %>%
# Compute share of missing weights
mutate(across(matches("_na_w"), ~ 100 - .x / w_sh * 100)) %>%
select(-.data$w_sh)

# - For relative variables
rel_missing_weights <- data %>%
select(-all_of(abs_vars)) %>%
mutate(across(rel_vars, ~ifelse(is.na(.x), NA_real_, .data$w), .names = "{col}_na_w")) %>%
group_by(pick(c(
"to_code", group_vars
))) %>%
summarise(across(matches("w"), sum, na.rm = TRUE)) %>%
ungroup() %>%
# Compute share of missing weights
mutate(across(matches("_na_w"), ~ 100 - .x / w * 100)) %>%
select(-.data$w)

}

# Overwrite data with absolute and relative values at to_code x group level
data <- abs_data %>%
full_join(rel_data, by = c("to_code", "to_version", group_vars))
# - done
rm(abs_data, rel_data)

# Add missing weights if requested
if(missing_weights_pct){
data <- data %>%
left_join(abs_missing_weights, by = c("to_code", group_vars)) %>%
left_join(rel_missing_weights, by = c("to_code", group_vars))
rm(abs_missing_weights, rel_missing_weights)
}
# - done

# Console Message
#-----------------
Expand Down
2 changes: 1 addition & 1 deletion docs/index.html

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion docs/pkgdown.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ pkgdown: 2.0.7
pkgdown_sha: ~
articles:
nuts: nuts.html
last_built: 2024-03-05T20:41Z
last_built: 2024-03-12T15:01Z
urls:
reference: https://aaoritz.github.io/nuts/reference
article: https://aaoritz.github.io/nuts/articles
Expand Down
3 changes: 3 additions & 0 deletions man/nuts_aggregate.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 3 additions & 0 deletions man/nuts_convert_version.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

49 changes: 45 additions & 4 deletions tests/testthat/test-nuts_aggregate.R
Original file line number Diff line number Diff line change
Expand Up @@ -259,7 +259,7 @@ test_that("Additional variables unspecified by the user (here: time)", {
to_level = 1,
variables = c("values" = "absolute",
"pct" = "relative"),
missing_rm = T
missing_rm = TRUE
) %>%
names(.),
c("to_code", "country", "values", "pct")
Expand All @@ -272,7 +272,7 @@ test_that("Feeding multiple NUTS versions within groups", {
manure %>%
filter(nchar(geo) == 5) %>%
select(geo, indic_ag, values) %>%
distinct(geo, .keep_all = T) %>%
distinct(geo, .keep_all = TRUE) %>%
nuts_classify(
nuts_code = "geo",
group_vars = "indic_ag",
Expand All @@ -281,7 +281,7 @@ test_that("Feeding multiple NUTS versions within groups", {
nuts_aggregate(
to_level = 1,
variables = c("values" = "absolute"),
missing_rm = T
missing_rm = TRUE
)
) %>%
grepl("Please make sure...", .),
Expand All @@ -295,7 +295,7 @@ test_that("Feeding multiple NUTS versions within groups. Option most frequent.",
manure %>%
filter(nchar(geo) == 5) %>%
select(geo, indic_ag, values) %>%
distinct(geo, .keep_all = T) %>%
distinct(geo, .keep_all = TRUE) %>%
nuts_classify(
nuts_code = "geo",
group_vars = "indic_ag",
Expand All @@ -311,3 +311,44 @@ test_that("Feeding multiple NUTS versions within groups. Option most frequent.",
c(52, 4)
)
})

test_that("Feeding multiple NUTS versions within groups. Option error.",
{
expect_error(
manure %>%
filter(nchar(geo) == 5) %>%
select(geo, indic_ag, values) %>%
distinct(geo, .keep_all = TRUE) %>%
nuts_classify(
nuts_code = "geo",
group_vars = "indic_ag",
data = .
) %>%
nuts_aggregate(
to_level = 1,
variables = c("values" = "absolute"),
multiple_versions = "error"
),
"Mixed NUTS versions within groups!"
)
})

test_that("Missing NUTS codes, reporting share of missing weights", {
expect_equal(
manure_2_indic() %>%
filter(grepl("DE", geo)) %>%
filter(!grepl("ZZ", geo)) %>%
filter(time %in% c(2000)) %>%
nuts_classify(nuts_code = "geo") %>%
nuts_aggregate(
to_level = 1,
variables = c("values" = "absolute",
"pct" = "relative"),
missing_weights_pct = TRUE
) %>%
names(.),
c("to_code", "country", "values", "pct", "values_na_w", "pct_na_w")
)
})


19 changes: 19 additions & 0 deletions tests/testthat/test-nuts_convert_version.R
Original file line number Diff line number Diff line change
Expand Up @@ -209,6 +209,25 @@ test_that("Missing NUTS codes", {
})


test_that("Missing NUTS codes, reporting share of missing weights per variable", {
expect_equal({
manure_2_indic_DE_2003() %>%
filter(!geo %in% c("DE93", "DED3")) %>%
nuts_classify(nuts_code = "geo",
data = .) %>%
nuts_convert_version(
to_version = "2021",
variables = c("values" = "absolute",
"pct" = "relative"),
missing_weights_pct = TRUE
) %>%
names(.)
}
,
c("to_code", "to_version","country", "values", "pct", "values_na_w", "pct_na_w"))
})


test_that("Ignoring missing NUTS codes", {
expect_equal({
manure_2_indic_DE_2003() %>%
Expand Down
35 changes: 26 additions & 9 deletions vignettes/nuts.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -594,9 +594,9 @@ pat_classified <- nuts_classify(

### Missing NUTS codes

`nuts_classify()` also checks whether the NUTS codes provided are complete. Missing values in the input data will, by default, result in missing values for all affected transformed regions in the output data.
`nuts_classify()` also checks whether the NUTS codes provided are complete (or values of a variable that the user wants to convert are missing for a region). Missing values in the input data will, by default, result in missing values for all affected transformed regions in the output data.

The example below illustrates this case.
The example with Slovenia below illustrates this case.

```{r}
pat_n3_nr_12_si <- pat_n3 %>%
Expand Down Expand Up @@ -627,22 +627,34 @@ nuts_convert_version(
filter(is.na(values))
```

Users have three options to overcome this problem.
Users have the option `missing_weights_pct` to investigate the consequences of missing values in the converted data. Setting the argument to `TRUE` returns a variable that indicates the percentage of missing weights due to missing NUTS codes (or missing values in the variable). The data frame below shows three regions that could not be computed due to missing data. Values in region `SI036` could not be computed since 97.9% of the weights are missing. Values for region `SI037` are missing as well even though only 0.8% of its population-weighted area is missing.

1. The warning can be ignored and the conversion proceeds while returning `NA`s (see above).

2. Users check whether the missing values can be replaced by, e.g., using alternative sources or imputing missing values.
```{r}
nuts_convert_version(
data = pat_classified,
to_version = "2021",
weight = "pop18",
variables = c("values" = "absolute"),
missing_weights_pct = TRUE
) %>%
arrange(desc(values_na_w))
```

Using the the share of missing weights in combination with the option `missing_rm`, the `nuts` package allows to recover some of the missing regions approximately. We can achieve this by setting `missing_rm` to `TRUE`, effectively assuming 0 for missing values. In the next step we remove regions with a high share of missing weights from the output data again. The data frame below shows that values for `SI037` could still be used assuming 0 patents for 0.8% of the missing population-weighted area to construct the region.

3. The argument `missing_rm` can be set to `TRUE`. In this case, missing values will be removed from the input data. Effectively, the interpolation procedures assume that missing values can be replaced with `0`, which may be a very strong assumption depending on the context.

```{r}
nuts_convert_version(
data = pat_classified,
to_version = "2021",
weight = "pop18",
variables = c("values" = "absolute"),
missing_weights_pct = TRUE,
missing_rm = TRUE
) %>%
filter(to_code %in% c("SI031", "SI036", "SI037"))
filter(to_code %in% c("SI031", "SI036", "SI037")) %>%
mutate(values_imp = ifelse(values_na_w < 1, values, NA))
```


Expand Down Expand Up @@ -752,9 +764,14 @@ The changes between the two versions can be summarized as follows:

## Spatial interpolation and conversion tables

To keep track of these changes, the `nuts` package uses regional flows between different NUTS versions. The package ships with a **conversion table** that can be called with `data(cross_walks)` based on [data provided by the JRC](https://urban.jrc.ec.europa.eu/nutsconverter/#/). It documents all boundary changes of NUTS regions.
To keep track of these changes, the `nuts` package uses two data sets:

1. Stocks: data(`all_nuts_codes`) contains **all historical NUTS codes** by NUTS version and country
2. Flows: data(`cross_walks`) contains the **conversion tables** between NUTS versions

They are based on [data provided by the JRC](https://urban.jrc.ec.europa.eu/nutsconverter/#/). Both data sets can also be used by the user manually to explore specific conversion patterns more closely.

For Norway going from version 2016 to 2021, the table looks as follows:
For Norway going from version 2016 to 2021, the `cross_walks` look as follows:

```{r, echo=FALSE, message = FALSE, warning = FALSE}
Expand Down

0 comments on commit 5af7b8f

Please sign in to comment.