share of missing weights option to nuts_convert and `nuts_aggregate…

…`, elaborate internatl data in vignette
ropensci · Mar 12, 2024 · 5af7b8f · 5af7b8f
1 parent d81c99c
commit 5af7b8f
Show file tree

Hide file tree

Showing 11 changed files with 177 additions and 18 deletions.
diff --git a/NAMESPACE b/NAMESPACE
@@ -31,6 +31,7 @@ importFrom(dplyr,if_any)
 importFrom(dplyr,inner_join)
 importFrom(dplyr,join_by)
 importFrom(dplyr,left_join)
+importFrom(dplyr,matches)
 importFrom(dplyr,mutate)
 importFrom(dplyr,mutate_at)
 importFrom(dplyr,n)

diff --git a/R/nuts-package.R b/R/nuts-package.R
@@ -23,6 +23,7 @@
 #' @importFrom dplyr relocate
 #' @importFrom dplyr rename
 #' @importFrom dplyr select
+#' @importFrom dplyr matches
 #' @importFrom dplyr slice
 #' @importFrom dplyr summarize
 #' @importFrom dplyr summarise

diff --git a/R/nuts_aggregate.R b/R/nuts_aggregate.R
@@ -8,6 +8,7 @@
 #' @param weight String with name of the weight used for conversion. Can be area size `'areaKm'` (default),
 #' population in 2011 `'pop11'` or 2018 `'pop18'`, or artificial surfaces in 2012 `'artif_surf12'` and 2018 `'artif_surf18'`.
 #' @param missing_rm Boolean that is FALSE by default. TRUE removes regional flows that depart from missing NUTS codes.
+#' @param missing_weights_pct Boolean that is FALSE by default. TRUE computes the percentage of missing weights due to missing departing NUTS regions for each variable.
 #' @param multiple_versions By default equal to `'error'`, when providing multiple NUTS versions within groups.
 #' If set to `'most_frequent'` data is converted using the best-matching NUTS version.
 #'
@@ -41,6 +42,7 @@ nuts_aggregate <- function(data,
                            variables,
                            weight = NULL,
                            missing_rm = FALSE,
+                           missing_weights_pct = FALSE,
                            multiple_versions = c("error", "most_frequent")) {
 
     # DEFINE CLI DIVs
@@ -220,8 +222,29 @@ nuts_aggregate <- function(data,
       summarise(across(rel_vars, ~ {sum(.x * .data$w, na.rm = missing_rm) / sum(.data$w)})) %>%
       ungroup()
 
+    # - Compute share of missing weights for each variable
+    if(missing_weights_pct){
+      missing_weights_data <- data %>%
+        mutate(across(c(abs_vars, rel_vars), ~ifelse(is.na(.x), NA_real_,  .data$w), .names = "{col}_na_w")) %>%
+        group_by(pick(c(
+          "to_code", group_vars
+        ))) %>%
+        summarise(across(matches("w"), ~ sum(.x, na.rm = TRUE))) %>%
+        ungroup() %>%
+        # Compute share of missing weights
+        mutate(across(matches("_na_w"), ~ 100 - .x / w * 100)) %>%
+        select(-.data$w)
+      }
+
+    # - Overwrite data with absolute and relative variables at the desired level x group
     data <- abs_data %>%
       full_join(rel_data, by = c("to_code", group_vars))
+
+    # - Add percentage of missing weights
+    if(missing_weights_pct){
+      data <- data %>%
+        full_join(missing_weights_data, by = c("to_code", group_vars))
+    }
     # - Done
 
     # Console Message

diff --git a/R/nuts_convert_version.R b/R/nuts_convert_version.R
@@ -8,6 +8,7 @@
 #' @param weight String with name of the weight used for conversion. Can be area size `'areaKm'` (default),
 #' population in 2011 `'pop11'` or 2018 `'pop18'`, or artificial surfaces in 2012 `'artif_surf12'` and 2018 `'artif_surf18'`.
 #' @param missing_rm Boolean that is FALSE by default. TRUE removes regional flows that depart from missing NUTS codes.
+#' @param missing_weights_pct Boolean that is FALSE by default. TRUE computes the percentage of missing weights due to missing departing NUTS regions for each variable.
 #' @param multiple_versions By default equal to `'error'`, when providing multiple NUTS versions within groups.
 #' If set to `'most_frequent'` data is converted using the best-matching NUTS version.
 #'
@@ -58,6 +59,7 @@ nuts_convert_version <-
            variables,
            weight = NULL,
            missing_rm = FALSE,
+           missing_weights_pct = FALSE,
            multiple_versions = c("error", "most_frequent")) {
 
     # DEFINE CLI DIVs
@@ -214,12 +216,13 @@ nuts_convert_version <-
       group_by(pick(c(
         "from_code", group_vars
       ))) %>%
-      mutate(w = .data$w / sum(.data$w)) %>%
+      mutate(w_sh = .data$w / sum(.data$w)) %>%
       ungroup() %>%
+      select(-.data$w) %>%
       group_by(pick(c(
         "to_code", group_vars
       ))) %>%
-      summarise(across(abs_vars, ~sum(.x * .data$w, na.rm = missing_rm))) %>%
+      summarise(across(abs_vars, ~sum(.x * .data$w_sh, na.rm = missing_rm))) %>%
       ungroup() %>%
       # - Add version
       mutate(to_version = to_version) %>%
@@ -238,10 +241,58 @@ nuts_convert_version <-
       mutate(to_version = to_version) %>%
       relocate(c("to_code", "to_version"))
 
+
+    # - Compute share of missing weights for each variable
+    if(missing_weights_pct){
+      # - For absolute variables
+      abs_missing_weights <- data %>%
+        select(-all_of(rel_vars)) %>%
+        group_by(pick(c(
+          "from_code", group_vars
+        ))) %>%
+        mutate(w_sh = .data$w / sum(.data$w)) %>%
+        ungroup() %>%
+        select(-.data$w) %>%
+        # - Create weight for every variable that is missing when variable is missing
+        mutate(across(abs_vars, ~ifelse(is.na(.x), NA_real_,  .data$w_sh), .names = "{col}_na_w")) %>%
+        group_by(pick(c(
+          "to_code", group_vars
+        ))) %>%
+        # - Summarize across weights
+        summarise(across(matches("w"), sum, na.rm = TRUE)) %>%
+        ungroup() %>%
+        # Compute share of missing weights
+        mutate(across(matches("_na_w"), ~ 100 - .x / w_sh * 100)) %>%
+        select(-.data$w_sh)
+
+      # - For relative variables
+      rel_missing_weights <- data %>%
+        select(-all_of(abs_vars)) %>%
+        mutate(across(rel_vars, ~ifelse(is.na(.x), NA_real_,  .data$w), .names = "{col}_na_w")) %>%
+        group_by(pick(c(
+          "to_code", group_vars
+        ))) %>%
+        summarise(across(matches("w"), sum, na.rm = TRUE)) %>%
+        ungroup() %>%
+        # Compute share of missing weights
+        mutate(across(matches("_na_w"), ~ 100 - .x / w * 100)) %>%
+        select(-.data$w)
+
+    }
+
+    # Overwrite data with absolute and relative values at to_code x group level
     data <- abs_data %>%
       full_join(rel_data, by = c("to_code", "to_version", group_vars))
-    # - done
+    rm(abs_data, rel_data)
 
+    # Add missing weights if requested
+    if(missing_weights_pct){
+      data <- data %>%
+        left_join(abs_missing_weights, by = c("to_code", group_vars)) %>%
+        left_join(rel_missing_weights, by = c("to_code", group_vars))
+      rm(abs_missing_weights, rel_missing_weights)
+    }
+    # - done
 
     # Console Message
     #-----------------

diff --git a/docs/index.html b/docs/index.html
diff --git a/docs/pkgdown.yml b/docs/pkgdown.yml
@@ -3,7 +3,7 @@ pkgdown: 2.0.7
 pkgdown_sha: ~
 articles:
   nuts: nuts.html
-last_built: 2024-03-05T20:41Z
+last_built: 2024-03-12T15:01Z
 urls:
   reference: https://aaoritz.github.io/nuts/reference
   article: https://aaoritz.github.io/nuts/articles

diff --git a/man/nuts_aggregate.Rd b/man/nuts_aggregate.Rd
diff --git a/man/nuts_convert_version.Rd b/man/nuts_convert_version.Rd
diff --git a/tests/testthat/test-nuts_aggregate.R b/tests/testthat/test-nuts_aggregate.R
@@ -259,7 +259,7 @@ test_that("Additional variables unspecified by the user (here: time)", {
         to_level = 1,
         variables = c("values" = "absolute",
                       "pct" = "relative"),
-        missing_rm = T
+        missing_rm = TRUE
       ) %>%
       names(.),
     c("to_code", "country", "values", "pct")
@@ -272,7 +272,7 @@ test_that("Feeding multiple NUTS versions within groups", {
       manure %>%
         filter(nchar(geo) == 5) %>%
         select(geo, indic_ag, values) %>%
-        distinct(geo,  .keep_all = T) %>%
+        distinct(geo,  .keep_all = TRUE) %>%
         nuts_classify(
           nuts_code = "geo",
           group_vars = "indic_ag",
@@ -281,7 +281,7 @@ test_that("Feeding multiple NUTS versions within groups", {
         nuts_aggregate(
           to_level = 1,
           variables = c("values" = "absolute"),
-          missing_rm = T
+          missing_rm = TRUE
         )
     ) %>%
       grepl("Please make sure...", .),
@@ -295,7 +295,7 @@ test_that("Feeding multiple NUTS versions within groups. Option most frequent.",
               manure %>%
                 filter(nchar(geo) == 5) %>%
                 select(geo, indic_ag, values) %>%
-                distinct(geo,  .keep_all = T) %>%
+                distinct(geo,  .keep_all = TRUE) %>%
                 nuts_classify(
                   nuts_code = "geo",
                   group_vars = "indic_ag",
@@ -311,3 +311,44 @@ test_that("Feeding multiple NUTS versions within groups. Option most frequent.",
               c(52, 4)
             )
           })
+
+test_that("Feeding multiple NUTS versions within groups. Option error.",
+          {
+            expect_error(
+              manure %>%
+                filter(nchar(geo) == 5) %>%
+                select(geo, indic_ag, values) %>%
+                distinct(geo,  .keep_all = TRUE) %>%
+                nuts_classify(
+                  nuts_code = "geo",
+                  group_vars = "indic_ag",
+                  data = .
+                ) %>%
+                nuts_aggregate(
+                  to_level = 1,
+                  variables = c("values" = "absolute"),
+                  multiple_versions = "error"
+                ),
+              "Mixed NUTS versions within groups!"
+            )
+          })
+
+test_that("Missing NUTS codes, reporting share of missing weights", {
+  expect_equal(
+    manure_2_indic() %>%
+      filter(grepl("DE", geo)) %>%
+      filter(!grepl("ZZ", geo)) %>%
+      filter(time %in% c(2000)) %>%
+      nuts_classify(nuts_code = "geo") %>%
+      nuts_aggregate(
+        to_level = 1,
+        variables = c("values" = "absolute",
+                      "pct" = "relative"),
+        missing_weights_pct = TRUE
+      ) %>%
+      names(.),
+    c("to_code", "country", "values", "pct", "values_na_w", "pct_na_w")
+  )
+})
+
+
diff --git a/tests/testthat/test-nuts_convert_version.R b/tests/testthat/test-nuts_convert_version.R
@@ -209,6 +209,25 @@ test_that("Missing NUTS codes", {
 })
 
 
+test_that("Missing NUTS codes, reporting share of missing weights per variable", {
+  expect_equal({
+    manure_2_indic_DE_2003()  %>%
+      filter(!geo %in% c("DE93", "DED3")) %>%
+      nuts_classify(nuts_code = "geo",
+                    data = .) %>%
+      nuts_convert_version(
+        to_version = "2021",
+        variables = c("values" = "absolute",
+                      "pct" = "relative"),
+        missing_weights_pct = TRUE
+      ) %>%
+      names(.)
+  }
+  ,
+  c("to_code", "to_version","country", "values", "pct", "values_na_w", "pct_na_w"))
+})
+
+
 test_that("Ignoring missing NUTS codes", {
   expect_equal({
     manure_2_indic_DE_2003()  %>%

diff --git a/vignettes/nuts.Rmd b/vignettes/nuts.Rmd
@@ -594,9 +594,9 @@ pat_classified <- nuts_classify(
 
 ### Missing NUTS codes
 
-`nuts_classify()` also checks whether the NUTS codes provided are complete. Missing values in the input data will, by default, result in missing values for all affected transformed regions in the output data. 
+`nuts_classify()` also checks whether the NUTS codes provided are complete (or values of a variable that the user wants to convert are missing for a region). Missing values in the input data will, by default, result in missing values for all affected transformed regions in the output data. 
 
-The example below illustrates this case.
+The example with Slovenia below illustrates this case.
 
 ```{r}
 pat_n3_nr_12_si <- pat_n3 %>%
@@ -627,22 +627,34 @@ nuts_convert_version(
   filter(is.na(values))
 ```
 
-Users have three options to overcome this problem.
+Users have the option `missing_weights_pct` to investigate the consequences of missing values in the converted data. Setting the argument to `TRUE` returns a variable that indicates the percentage of missing weights due to missing NUTS codes (or missing values in the variable). The data frame below shows three regions that could not be computed due to missing data. Values in region `SI036` could not be computed since 97.9% of the weights are missing. Values for region `SI037` are missing as well even though only 0.8% of its population-weighted area is missing. 
 
-  1. The warning can be ignored and the conversion proceeds while returning `NA`s (see above).
 
-  2. Users check whether the missing values can be replaced by, e.g., using alternative sources or imputing missing values.
+```{r}
+nuts_convert_version(
+  data = pat_classified, 
+  to_version = "2021", 
+  weight = "pop18",
+  variables = c("values" = "absolute"),
+  missing_weights_pct = TRUE
+  ) %>% 
+  arrange(desc(values_na_w))
+```
+
+Using the the share of missing weights in combination with the option `missing_rm`, the `nuts` package allows to recover some of the missing regions approximately. We can achieve this by setting `missing_rm` to `TRUE`, effectively assuming 0 for missing values. In the next step we remove regions with a high share of missing weights from the output data again. The data frame below shows that values for `SI037` could still be used assuming 0 patents for 0.8% of the missing population-weighted area to construct the region. 
 
-  3. The argument `missing_rm` can be set to `TRUE`. In this case, missing values will be removed from the input data. Effectively, the interpolation procedures assume that missing values can be replaced with `0`, which may be a very strong assumption depending on the context.
 
 ```{r}
 nuts_convert_version(
   data = pat_classified, 
   to_version = "2021", 
+  weight = "pop18",
   variables = c("values" = "absolute"),
+  missing_weights_pct = TRUE,
   missing_rm = TRUE
   ) %>% 
-  filter(to_code %in% c("SI031", "SI036", "SI037"))
+  filter(to_code %in% c("SI031", "SI036", "SI037")) %>% 
+  mutate(values_imp = ifelse(values_na_w < 1, values, NA))
 ```
 
 
@@ -752,9 +764,14 @@ The changes between the two versions can be summarized as follows:
 
 ## Spatial interpolation and conversion tables
 
-To keep track of these changes, the `nuts` package uses regional flows between different NUTS versions. The package ships with a **conversion table** that can be called with `data(cross_walks)` based on [data provided by the JRC](https://urban.jrc.ec.europa.eu/nutsconverter/#/). It documents all boundary changes of NUTS regions.
+To keep track of these changes, the `nuts` package uses two data sets: 
+
+1. Stocks: data(`all_nuts_codes`) contains **all historical NUTS codes** by NUTS version and country
+2. Flows: data(`cross_walks`) contains the **conversion tables** between NUTS versions
+
+They are based on [data provided by the JRC](https://urban.jrc.ec.europa.eu/nutsconverter/#/). Both data sets can also be used by the user manually to explore specific conversion patterns more closely. 
 
-For Norway going from version 2016 to 2021, the table looks as follows:
+For Norway going from version 2016 to 2021, the `cross_walks` look as follows:
 
 ```{r,  echo=FALSE, message = FALSE, warning = FALSE}