commit 700992fbe5fc1e0f405b5102e1ca18c22da1eb05
Author: Pablo Lillo Cea <pablolillocea@pm.me>
Date:   Thu May 7 14:32:28 2026 +0200

    initial commit

diff --git a/CA_municipalities.pdf b/CA_municipalities.pdf
new file mode 100644
index 0000000..27821c6
Binary files /dev/null and b/CA_municipalities.pdf differ
diff --git a/CA_municipalities.qmd b/CA_municipalities.qmd
new file mode 100644
index 0000000..ff3e17f
--- /dev/null
+++ b/CA_municipalities.qmd
@@ -0,0 +1,78 @@
+---
+title: "CAs for the municipalities dataset"
+author: "Clara Comte"
+format: pdf
+editor: visual
+fig-width: 10
+fig-height: 8
+---
+
+```{r}
+#| echo: false
+#| results: hide
+#| message: false
+#| warning: false
+
+library(readxl)
+library(tibble)
+library(FactoMineR)
+library(factoextra)
+library(showtext)
+library(sysfonts)
+# The workbook is tracked under data/, not next to this document.
+col_types <- c("numeric", "numeric", "text", rep("numeric", 254))
+municipalities <- read_excel("data/Municipalities_db.xlsx", col_types = col_types)
+```
+
+## Municipalities and educational levels
+
+Let's do a CA with municipalities and educational levels. In this case, I chose to display two biplots: one with every municipality, without labels for more visibility, and one with labels, but with only the 20 municipalities that contribute the most to the CA.
+
+```{=latex}
+\noindent\
+```
+
+```{r}
+#| echo: false
+#| message: false
+#| warning: false
+
+# Register CMU Serif for showtext; set CMU_FONT_DIR to point at another font folder.
+font_dir <- Sys.getenv("CMU_FONT_DIR", unset = "/home/clara/fonts/cmu")
+sysfonts::font_add("CMU Serif",
+  regular = file.path(font_dir, "cmunrm.ttf"), italic = file.path(font_dir, "cmunoti.ttf"),
+  bold = file.path(font_dir, "cmunrb.ttf"), bolditalic = file.path(font_dir, "cmunbi.ttf"))
+showtext::showtext_auto()
+showtext::showtext_opts(dpi = 300)
+theme_set(theme_gray(base_family = "CMU Serif"))
+
+# 2022 contingency table: municipalities in rows, education levels in columns.
+muni_2022 <- municipalities[municipalities$Year %in% 2022, ] # %in% is FALSE (not NA) for missing years
+tab <- muni_2022[, c("Municipality", "Education level_primary_secondary", "Education level_upper_secondary", "Education level_post-secondary", "Education level_post-graduate")]
+tab <- column_to_rownames(tab, var = "Municipality")
+colnames(tab) <- c("Primary", "Upper secondary", "Post-secondary", "Post-graduate")
+tab <- as.matrix(tab)
+afc <- CA(tab, graph = FALSE)
+
+# Theme shared by both biplots; adding a list to a plot adds each element in turn.
+ca_theme <- list(theme_bw(base_family = "CMU Serif"),
+  theme(text = element_text(family = "CMU Serif"), axis.title = element_text(family = "CMU Serif"),
+        axis.text = element_text(family = "CMU Serif"),
+        plot.title = element_text(family = "CMU Serif", hjust = 0.5)))
+
+fviz_ca_biplot(afc, repel = TRUE, col.col = "red", label = "col", title = "CA of Municipalities and Education Levels (2022): Biplot 1") + ca_theme
+fviz_ca_biplot(afc, repel = TRUE, select.row = list(contrib = 20), col.col = "red",
+               title = "CA of Municipalities and Education Levels (2022): Biplot 2") + ca_theme
+```
+
+The first axis explains 91 % of the total inertia, which makes it highly important in this CA. We can understand it as an educational gradient, as it opposes primary and upper secondary levels on the left of the axis to post-secondary and post-graduate levels on the right. Cities like Stockholm, Uppsala or Lund are highly educated while Eskilstuna or Gislaved are less educated.
+
+The second axis, which explains 5 % of the inertia, seems highly structured by the post-graduate modality. At the top of the biplot, we find cities such as Uppsala or Lund, known as Sweden's two major university cities, which stand out clearly from all other municipalities. At the bottom, we find large cities like Stockholm or Malmö that have high post-secondary levels but relatively few post-graduates.
+
+Overall, this graph suggests two things. First, the fact that axis 1 explains 91 % of the inertia shows that the educational gradient is the dominant structural feature of the data. Second, it suggests significant inequalities between a minority of highly educated municipalities (Lund, Uppsala, Solna...) and poorly educated ones (Gislaved, Eskilstuna, Norrtälje...).
+
+```{=latex}
+\newpage
+```
+
+## Municipalities and ? (CA n°2)
diff --git a/data/Municipalities_db.xlsx b/data/Municipalities_db.xlsx
new file mode 100644
index 0000000..85953d7
Binary files /dev/null and b/data/Municipalities_db.xlsx differ
diff --git a/data/Municipalities_documentation.docx b/data/Municipalities_documentation.docx
new file mode 100644
index 0000000..ffed765
Binary files /dev/null and b/data/Municipalities_documentation.docx differ
diff --git a/data/processed/m_raw.rds b/data/processed/m_raw.rds
new file mode 100644
index 0000000..5ab58c5
Binary files /dev/null and b/data/processed/m_raw.rds differ
diff --git a/src/municipalities/00-import.R b/src/municipalities/00-import.R
new file mode 100644
index 0000000..7b0d638
--- /dev/null
+++ b/src/municipalities/00-import.R
@@ -0,0 +1,46 @@
+# 00-Libraries -------------------------------------------
+library(tidyverse)
+library(readxl)
+
+# 01-Import ----------------------------------------------
+
+# Municipalities_db.xlsx has two title rows before the merged-cell header:
+# Row 1: "Municipalities Database" (title)
+# Row 2: blank
+# Row 3: category names (forward-filled across sub-columns)
+# Row 4: sub-category names
+# Row 5+: data
+
+# Read only the two header rows (sheet rows 3-4), without column names, so
+# they can be turned into a single vector of column names afterwards.
+header_rows <- read_xlsx(
+  path = "data/Municipalities_db.xlsx",
+  skip = 2, n_max = 2, col_names = FALSE
+)
+
+# Build one snake_case name per column: "<category>_<subcategory>", or just
+# "<category>" when the sub-category cell is empty.
+
+# One row per spreadsheet column; fill() carries each category forward over
+# the NA cells that follow it.
+header_tbl <- as_tibble(t(header_rows), .name_repair = "unique") |>
+  set_names(c("category", "subcategory")) |>
+  fill(category)
+raw_labels <- if_else(
+  is.na(header_tbl$subcategory),
+  header_tbl$category,
+  str_c(header_tbl$category, header_tbl$subcategory, sep = " - ")
+)
+# Lower-case, collapse every run of other characters to "_", drop a trailing "_".
+col_names <- str_to_lower(raw_labels) |>
+  str_replace_all("[^a-z0-9]+", "_") |> str_remove("_$")
+
+# Read the data rows as text, then coerce every column except the identifiers
+# to numeric; cells that do not parse become NA without a warning.
+to_number <- function(x) suppressWarnings(as.numeric(x))
+municipalities_raw <- "data/Municipalities_db.xlsx" |>
+  read_xlsx(col_names = col_names, col_types = "text", skip = 4) |>
+  mutate(across(-c(code, municipality), to_number))
+
+# Save the imported table to disk as an .rds file.
+write_rds(municipalities_raw, "data/processed/m_raw.rds")
diff --git a/src/municipalities/01-sampling.R b/src/municipalities/01-sampling.R
new file mode 100644
index 0000000..098d6be
--- /dev/null
+++ b/src/municipalities/01-sampling.R
@@ -0,0 +1,2 @@
+# 01-Read ----------------------------------------------------------------------
+municipalities_raw <- readr::read_rds("data/processed/m_raw.rds") # readr:: because this script attaches no packages