47 lines
1.1 KiB
R
47 lines
1.1 KiB
R
|
|
# 00-Libraries -------------------------------------------
|
||
|
|
library(tidyverse)
|
||
|
|
library(readxl)
|
||
|
|
|
||
|
|
# 01-Import ----------------------------------------------
|
||
|
|
|
||
|
|
# Municipalities_db.xlsx has a title row and a blank row above its two-row,
# merged-cell header:
|
||
|
|
# Row 1: "Municipalities Database" (title)
|
||
|
|
# Row 2: blank
|
||
|
|
# Row 3: category names (forward-filled across sub-columns)
|
||
|
|
# Row 4: sub-category names
|
||
|
|
# Row 5+: data
|
||
|
|
|
||
|
|
# Grab only the two header rows: skip the first two sheet rows, read the next
# two, and treat neither of them as column names.
header_rows <- "data/Municipalities_db.xlsx" |>
  read_xlsx(skip = 2, n_max = 2, col_names = FALSE)
|
||
|
|
|
||
|
|
# Collapse the two header rows into one vector of snake_case column names.
# Evaluated inside local() so the intermediates do not leak into the workspace.
col_names <- local({
  # Transpose so each spreadsheet column becomes one (category, subcategory)
  # row, then carry each category down over the blanks that follow it.
  header_pairs <- as_tibble(t(header_rows), .name_repair = "unique") |>
    set_names(c("category", "subcategory")) |>
    fill(category)

  cat_label <- header_pairs[["category"]]
  sub_label <- header_pairs[["subcategory"]]

  # Columns without a subcategory keep the bare category as their label.
  combined <- if_else(
    is.na(sub_label),
    cat_label,
    str_c(cat_label, sub_label, sep = " - ")
  )

  # Lower-case, squash every run of other characters into a single "_",
  # and drop one trailing "_" if present.
  slug <- str_replace_all(str_to_lower(combined), "[^a-z0-9]+", "_")
  str_remove(slug, "_$")
})
|
||
|
|
|
||
|
|
# Read the data rows (everything after the four header/title rows) as text
# under the flattened column names, then convert every column except `code`
# and `municipality` to numeric. Values that fail to parse become NA; the
# coercion warnings are intentionally silenced.
municipalities_raw <- "data/Municipalities_db.xlsx" |>
  read_xlsx(col_names = col_names, col_types = "text", skip = 4) |>
  mutate(
    across(
      -c(code, municipality),
      function(values) suppressWarnings(as.numeric(values))
    )
  )
|
||
|
|
|
||
|
|
# Save the imported table as an .rds file. Create the output directory first
# so the write does not fail when data/processed/ is missing (for example on
# a fresh checkout); the call is a silent no-op if the directory already
# exists.
dir.create("data/processed", recursive = TRUE, showWarnings = FALSE)
write_rds(municipalities_raw, "data/processed/m_raw.rds")
|