initial commit
This commit is contained in:
commit
700992fbe5
7 changed files with 126 additions and 0 deletions
BIN
CA_municipalities.pdf
Normal file
BIN
CA_municipalities.pdf
Normal file
Binary file not shown.
78
CA_municipalities.qmd
Normal file
78
CA_municipalities.qmd
Normal file
|
|
@ -0,0 +1,78 @@
|
||||||
|
---
|
||||||
|
title: "CAs for the municipalities dataset"
|
||||||
|
author: "Clara Comte"
|
||||||
|
format: pdf
|
||||||
|
editor: visual
|
||||||
|
fig-width: 10
|
||||||
|
fig-height: 8
|
||||||
|
---
|
||||||
|
|
||||||
|
```{r}
|
||||||
|
#| echo: false
|
||||||
|
#| results: hide
|
||||||
|
#| message: false
|
||||||
|
#| warning: false
|
||||||
|
|
||||||
|
library(readxl)
|
||||||
|
library(tibble)
|
||||||
|
library(FactoMineR)
|
||||||
|
library(factoextra)
|
||||||
|
library(showtext)
|
||||||
|
library(sysfonts)
|
||||||
|
|
||||||
|
# Column types for the 257 workbook columns: two numeric columns, one text
# column, then 254 numeric columns.
col_types <- c("numeric", "numeric", "text", rep("numeric", 254))

# The workbook is committed under data/ (the import script reads it from the
# same location), so resolve it from there; the bare file name only works
# when a copy of the workbook sits next to this document.
municipalities <- read_excel(
  "data/Municipalities_db.xlsx",
  col_types = col_types
)
|
||||||
|
```
|
||||||
|
|
||||||
|
## Municipalities and educational levels
|
||||||
|
|
||||||
|
Let's do a CA with municipalities and educational levels. In this case, I chose to display two biplots: one with every municipality, without labels for more visibility, and one with labels, but with only the 20 municipalities that contribute the most to the CA.
|
||||||
|
|
||||||
|
```{=latex}
|
||||||
|
\noindent\
|
||||||
|
```
|
||||||
|
|
||||||
|
```{r}
|
||||||
|
#| echo: false
|
||||||
|
#| message: false
|
||||||
|
#| warning: false
|
||||||
|
|
||||||
|
# Register the four CMU Serif faces under the family name "CMU Serif" so the
# plots can request it by name.
# NOTE(review): the paths are absolute and machine-specific -- confirm the
# fonts exist at this location wherever the document is rendered.
sysfonts::font_add(
  family = "CMU Serif",
  regular = "/home/clara/fonts/cmu/cmunrm.ttf",
  bold = "/home/clara/fonts/cmu/cmunrb.ttf",
  italic = "/home/clara/fonts/cmu/cmunoti.ttf",
  bolditalic = "/home/clara/fonts/cmu/cmunbi.ttf"
)

# Let showtext draw the text of every plot produced from here on.
showtext::showtext_auto()
|
||||||
|
|
||||||
|
# Build the municipality x education-level table for 2022 and run the CA.

# Short display label -> source column name for the four education levels.
edu_cols <- c(
  "Primary" = "Education level_primary_secondary",
  "Upper secondary" = "Education level_upper_secondary",
  "Post-secondary" = "Education level_post-secondary",
  "Post-graduate" = "Education level_post-graduate"
)

# Keep the 2022 observations only.
muni_2022 <- municipalities[municipalities$Year == 2022, ]

# Municipality names become row names so that CA() treats them as row labels.
tab <- muni_2022[, c("Municipality", unname(edu_cols))]
tab <- column_to_rownames(tab, var = "Municipality")
colnames(tab) <- names(edu_cols)
tab <- as.matrix(tab)

# graph = FALSE: the plots are drawn explicitly further down.
afc <- CA(tab, graph = FALSE)
|
||||||
|
|
||||||
|
# Enable showtext rendering and set its resolution option to 300 dpi.
showtext::showtext_auto(enable = TRUE)
showtext::showtext_opts(dpi = 300)

# Make a grey theme in CMU Serif the session-wide ggplot2 default.
theme_set(theme_gray(base_family = "CMU Serif"))
|
||||||
|
|
||||||
|
# Biplot 1: every municipality is drawn, but only the education levels
# (columns, in red) are labelled.
fviz_ca_biplot(
  afc,
  label = "col",
  col.col = "red",
  repel = TRUE,
  title = "CA of Municipalities and Education Levels (2022): Biplot 1"
) +
  theme_bw(base_family = "CMU Serif") +
  theme(
    text = element_text(family = "CMU Serif"),
    axis.title = element_text(family = "CMU Serif"),
    axis.text = element_text(family = "CMU Serif"),
    plot.title = element_text(family = "CMU Serif", hjust = 0.5)
  )
|
||||||
|
|
||||||
|
# Biplot 2: labelled version restricted to the 20 municipalities (rows) that
# contribute the most to the CA. The title is numbered "Biplot 2" so the two
# figures can be told apart (it was previously a copy of the first title).
fviz_ca_biplot(
  afc,
  repel = TRUE,
  select.row = list(contrib = 20),
  col.col = "red",
  title = "CA of Municipalities and Education Levels (2022): Biplot 2"
) +
  theme_bw(base_family = "CMU Serif") +
  theme(
    text = element_text(family = "CMU Serif"),
    axis.title = element_text(family = "CMU Serif"),
    axis.text = element_text(family = "CMU Serif"),
    plot.title = element_text(family = "CMU Serif", hjust = 0.5)
  )
|
||||||
|
```
|
||||||
|
|
||||||
|
The first axis explains 91 % of the total inertia, which makes it highly important in this CA. We can understand it as an educational gradient as it opposes primary and upper secondary levels on the left of the axis, and post-secondary and post-graduate levels to the right. Cities like Stockholm, Uppsala or Lund are highly educated while Eskilstuna or Gislaved are less educated.
|
||||||
|
|
||||||
|
The second axis, which explains 5 % of the inertia, seems highly structured by the post-graduate modalities. At the top of the biplot, we find cities such as Uppsala or Lund, known as Sweden's two major university cities, which stand out clearly from all other municipalities. At the bottom, by contrast, we find large cities like Stockholm or Malmö that have high post-secondary levels but relatively few post-graduates.
|
||||||
|
|
||||||
|
Overall, this graph suggests many things. First, the fact that axis 1 explains 91 % of the inertia shows that the educational gradient is the dominant structural feature of the data. Then, this graph suggests significant inequalities between a minority of highly educated municipalities (Lund, Uppsala, Solna...) and poorly educated cities (Gislaved, Eskilstuna, Norrtälje...).
|
||||||
|
|
||||||
|
```{=latex}
|
||||||
|
\newpage
|
||||||
|
```
|
||||||
|
|
||||||
|
## Municipalities and ? (CA n°2)
|
||||||
BIN
data/Municipalities_db.xlsx
Normal file
BIN
data/Municipalities_db.xlsx
Normal file
Binary file not shown.
BIN
data/Municipalities_documentation.docx
Normal file
BIN
data/Municipalities_documentation.docx
Normal file
Binary file not shown.
BIN
data/processed/m_raw.rds
Normal file
BIN
data/processed/m_raw.rds
Normal file
Binary file not shown.
46
src/municipalities/00-import.R
Normal file
46
src/municipalities/00-import.R
Normal file
|
|
@ -0,0 +1,46 @@
|
||||||
|
# 00-Libraries -------------------------------------------
|
||||||
|
library(tidyverse)
|
||||||
|
library(readxl)
|
||||||
|
|
||||||
|
# 01-Import ----------------------------------------------
|
||||||
|
|
||||||
|
# Municipalities_db.xlsx has two title rows before the merged-cell header:
|
||||||
|
# Row 1: "Municipalities Database" (title)
|
||||||
|
# Row 2: blank
|
||||||
|
# Row 3: category names (forward-filled across sub-columns)
|
||||||
|
# Row 4: sub-category names
|
||||||
|
# Row 5+: data
|
||||||
|
|
||||||
|
# Read just the two header rows (category names, then sub-category names) as
# plain data: skip the two rows above them and stop after two rows.
header_rows <- read_xlsx(
  path = "data/Municipalities_db.xlsx",
  skip = 2,
  n_max = 2,
  col_names = FALSE
)
|
||||||
|
|
||||||
|
# Turn the two header rows into one snake_case name per workbook column.

# One row per workbook column: transpose the header rows, then carry each
# category name down over the sub-columns that follow it.
header_tbl <- header_rows |>
  t() |>
  as_tibble(.name_repair = "unique") |>
  set_names(c("category", "subcategory")) |>
  fill(category)

# "category - subcategory", or the category alone when there is no
# sub-category.
raw_labels <- if_else(
  is.na(header_tbl$subcategory),
  header_tbl$category,
  str_c(header_tbl$category, header_tbl$subcategory, sep = " - ")
)

# Lower-case, collapse every run of characters outside a-z / 0-9 into a single
# underscore, and drop one trailing underscore if present.
col_names <- raw_labels |>
  str_to_lower() |>
  str_replace_all("[^a-z0-9]+", "_") |>
  str_remove("_$")
|
||||||
|
|
||||||
|
# Read the data rows (everything below the four header/title rows) as text,
# using the column names built above.
municipalities_text <- read_xlsx(
  path = "data/Municipalities_db.xlsx",
  col_names = col_names,
  col_types = "text",
  skip = 4
)

# Convert every column except `code` and `municipality` to numeric. Values
# that cannot be parsed become NA; the coercion warnings are silenced.
to_numeric_quietly <- function(x) suppressWarnings(as.numeric(x))
municipalities_raw <- mutate(
  municipalities_text,
  across(-c(code, municipality), to_numeric_quietly)
)

# Save the imported table for the later scripts.
write_rds(municipalities_raw, "data/processed/m_raw.rds")
|
||||||
2
src/municipalities/01-sampling.R
Normal file
2
src/municipalities/01-sampling.R
Normal file
|
|
@ -0,0 +1,2 @@
|
||||||
|
# 01-Read ----------------------------------------------------------------------
|
||||||
|
# Load the table saved by the import script. This script attaches no packages,
# so read_rds() is called through the readr namespace to work in a fresh
# session.
municipalities_raw <- readr::read_rds("data/processed/m_raw.rds")
|
||||||
Loading…
Add table
Reference in a new issue