# ============================================================
#  PPT CONTENT — Empirical urban-rural typology of Swedish municipalities
#  Correspondence Analysis + hierarchical clustering (2022 sampling)
#  Six-cluster cut
#
#  Run from the project root (ruralitic-qrm/).
#  Figures are saved to ppt/figures/. Slide text is in the
#  comments below each section header.
# ============================================================

library(tidyverse)
library(FactoMineR)
library(factoextra)
library(ggrepel)
library(showtext)

font_add_google("Source Sans 3", "source_sans_3")
showtext_auto()

theme_ppt <- theme_minimal(base_size = 14, base_family = "source_sans_3") +
  theme(
    panel.grid.minor  = element_blank(),
    panel.grid.major  = element_line(colour = "grey92"),
    legend.position   = "bottom",
    legend.title      = element_text(face = "bold"),
    plot.margin       = margin(8, 12, 8, 8)
  )
theme_set(theme_ppt)

cluster_labels <- c(
  "1" = "Remote & peripheral",
  "2" = "Central industrial towns",
  "3" = "Peri-rural commuter belt",
  "4" = "Regional service centres",
  "5" = "Affluent suburbs & university satellites",
  "6" = "Inner Stockholm core"
)
cluster_palette <- c(
  "1" = "#B07F4F",
  "2" = "#D7A86E",
  "3" = "#7CB07C",
  "4" = "#6FA8DC",
  "5" = "#C2738B",
  "6" = "#7A3A4F"
)

county_names <- c(
  "01" = "Stockholm",    "03" = "Uppsala",       "04" = "Södermanland",
  "05" = "Östergötland", "06" = "Jönköping",     "07" = "Kronoberg",
  "08" = "Kalmar",       "09" = "Gotland",       "10" = "Blekinge",
  "12" = "Skåne",        "13" = "Halland",       "14" = "Västra Götaland",
  "17" = "Värmland",     "18" = "Örebro",        "19" = "Västmanland",
  "20" = "Dalarna",      "21" = "Gävleborg",     "22" = "Västernorrland",
  "23" = "Jämtland",     "24" = "Västerbotten",  "25" = "Norrbotten"
)

# Load data
afc      <- read_rds("data/processed/proportions_CA.rds")
hcpc     <- read_rds("data/processed/proportions_HCPC_6.rds")
clusters <- read_csv("data/processed/cluster_assignment_6.csv",
                     show_col_types = FALSE) |>
  rename(cluster = c6) |>
  mutate(cluster_label = cluster_labels[as.character(cluster)])

panel_raw <- readxl::read_excel("data/Municipalities_db_2.xlsx",
                                col_types = "text", n_max = 290) |>
  transmute(municipality, code = str_pad(code, 4, "left", "0"))

dim1_pct <- round(afc$eig[1, 2], 1)
dim2_pct <- round(afc$eig[2, 2], 1)

row_df <- as.data.frame(afc$row$coord[, 1:2]) |>
  rownames_to_column("municipality") |>
  left_join(clusters, by = "municipality")


# ============================================================
#  SLIDE 1 — Data & motivation
#  Title: "Building an urban–rural typology from the data"
#
#  Bullet points:
#   • Sweden's 290 municipalities are routinely classified by
#     administrative or population-size rules (SCB). These categories
#     are imposed from outside the data.
#   • This analysis asks instead: how do municipalities actually differ
#     across structural dimensions?
#   • Data source: Statistics Sweden, 2022 project sampling.
#   • Six ACTIVE variable blocks:
#       Education (4 attainment levels)
#       Employment (16 activity sectors)
#       Housing (rented / tenant-owned / owner-occupied)
#       Workplace mobility (commuters in, commuters out, working locally)
#       Migration (in- and outmigration)
#       Demography (retirees, number of localities)
#   • Two SUPPLEMENTARY blocks projected post-hoc:
#       Educational provision (pre-school through HE, by ownership)
#       Opinion (survey satisfaction with local schools)
#   • Key pre-processing: block normalisation. Within each block,
#     every municipality is rescaled to the same total, preventing
#     Stockholm from dominating the analysis due to sheer size.
#
#  No figure needed — use the variable block list as a visual
#  schematic or table in the slide.
# ============================================================


# ============================================================
#  SLIDE 2 — Correspondence analysis: the space of municipalities
#  Title: "Two dimensions capture two-thirds of the variation"
#
#  Bullet points:
#   • CA places all 290 municipalities in a low-dimensional space
#     where proximity = similarity on the active variables.
#   • Dim 1 (rural–urban): Left pole → agriculture, mining &
#     manufacturing, owner-occupied housing, upper-secondary education.
#     Right pole → IT, finance, professional services, post-graduate
#     attainment, apartment housing.
#   • Dim 2 (labour-market self-containment): Top → residents work
#     where they live (Göteborg, Malmö, Umeå). Bottom → outbound
#     commuters, residential satellites (Knivsta, Salem, Staffanstorp).
#   • Together Dim 1 + Dim 2 account for ~65% of total variability.
#
#  Figure: slide2_biplot.png
# ============================================================

label_munis <- c("Stockholm", "Göteborg", "Malmö", "Uppsala", "Lund", "Umeå",
                 "Linköping", "Solna", "Danderyd", "Kiruna", "Gotland",
                 "Knivsta", "Falköping", "Tomelilla", "Skellefteå", "Piteå",
                 "Partille", "Sundbyberg", "Lindesberg")

contribs <- as.data.frame(afc$col$contrib) |>
  rownames_to_column("variable") |>
  mutate(total = `Dim 1` + `Dim 2`) |>
  arrange(desc(total)) |>
  head(15)

col_df <- as.data.frame(afc$col$coord[, 1:2]) |>
  rownames_to_column("variable") |>
  filter(variable %in% contribs$variable)

fig_biplot <- ggplot() +
  geom_hline(yintercept = 0, linetype = "dashed", colour = "grey60") +
  geom_vline(xintercept = 0, linetype = "dashed", colour = "grey60") +
  geom_point(data = row_df,
             aes(`Dim 1`, `Dim 2`, colour = cluster_label),
             alpha = 0.75, size = 2.2) +
  geom_text_repel(
    data    = row_df |> filter(municipality %in% label_munis),
    aes(`Dim 1`, `Dim 2`, label = municipality),
    size = 3.5, colour = "grey20", family = "source_sans_3",
    max.overlaps = 30, segment.size = 0.25
  ) +
  geom_point(data = col_df, aes(`Dim 1`, `Dim 2`),
             shape = 17, colour = "firebrick", size = 3) +
  geom_text_repel(
    data = col_df, aes(`Dim 1`, `Dim 2`, label = variable),
    colour = "firebrick", size = 3, family = "source_sans_3",
    max.overlaps = 30, segment.size = 0.25
  ) +
  scale_colour_manual(values = cluster_palette |> set_names(cluster_labels),
                      name = NULL) +
  labs(
    x = paste0("Dim 1 — rural–urban (", dim1_pct, "%)"),
    y = paste0("Dim 2 — labour-market self-containment (", dim2_pct, "%)")
  ) +
  guides(colour = guide_legend(nrow = 2))

ggsave("ppt/figures/slide2_biplot.png", fig_biplot,
       width = 11, height = 7, dpi = 150)
message("Saved: ppt/figures/slide2_biplot.png")


# ============================================================
#  SLIDE 3 — Six empirical types of municipality
#  Title: "Six coherent types emerge from Ward clustering"
#
#  Cluster descriptions:
#
#  Cl 1 — Remote & peripheral
#    Most rural extreme. Agriculture/forestry/fishing, sparsely
#    populated, own full educational infrastructure at every age
#    (komvux, preschool). Examples: Kiruna, Piteå, Skellefteå, Gotland.
#
#  Cl 2 — Central industrial towns
#    Rural-industrial, but not as remote. Mining & manufacturing
#    dominant. Owner-occupied housing, upper-secondary ceiling.
#    Examples: Falköping, Lindesberg, Hedemora.
#
#  Cl 3 — Peri-rural commuter belt
#    Small southern and central rural municipalities. Many residents
#    commute out. Owner-occupied, construction and agriculture visible.
#    Below-average satisfaction with local high schools.
#    Examples: Tomelilla, Osby, Klippan, Sölvesborg.
#
#  Cl 4 — Regional service centres
#    Mid-sized cities with self-contained labour markets. Rented and
#    tenant-owned housing, public administration, post-secondary
#    attainment. Examples: Göteborg, Malmö, Umeå, Linköping, Kalmar.
#
#  Cl 5 — Affluent suburbs & university satellites
#    Outbound commuters, post-secondary attainment, tenant-owned
#    housing. Residential satellites whose labour markets sit elsewhere.
#    Examples: Lund, Mölndal, Partille, Huddinge, Knivsta, Kungsbacka.
#
#  Cl 6 — Inner Stockholm core
#    Inbound commuting, IT and finance employment, apartment housing,
#    post-graduate attainment at extreme levels. These are destinations
#    in the commuting network, not origins.
#    Examples: Stockholm, Solna, Sundbyberg, Danderyd, Lidingö, Täby.
#
#  Figures: slide3_clusters.png (main) · slide3_dendrogram.png (inset)
# ============================================================

centroids <- row_df |>
  group_by(cluster_label) |>
  summarise(`Dim 1` = mean(`Dim 1`), `Dim 2` = mean(`Dim 2`), .groups = "drop")

fig_clusters <- ggplot(row_df, aes(`Dim 1`, `Dim 2`, colour = cluster_label)) +
  geom_hline(yintercept = 0, linetype = "dashed", colour = "grey70") +
  geom_vline(xintercept = 0, linetype = "dashed", colour = "grey70") +
  geom_point(alpha = 0.5, size = 2) +
  stat_ellipse(level = 0.68, linewidth = 0.7) +
  geom_point(data = centroids, size = 5, shape = 18, colour = "grey15") +
  geom_label_repel(
    data  = centroids, aes(label = cluster_label),
    fill  = "white", colour = "black",
    family = "source_sans_3", size = 3.5,
    label.size = 0.25, label.padding = unit(0.3, "lines"),
    min.segment.length = 0, max.overlaps = 20
  ) +
  scale_colour_manual(values = cluster_palette |> set_names(cluster_labels),
                      guide = "none") +
  labs(
    x = paste0("Dim 1 — rural–urban (", dim1_pct, "%)"),
    y = paste0("Dim 2 — labour-market self-containment (", dim2_pct, "%)")
  )

ggsave("ppt/figures/slide3_clusters.png", fig_clusters,
       width = 11, height = 7, dpi = 150)
message("Saved: ppt/figures/slide3_clusters.png")

# Dendrogram with 6-cluster cut
tree  <- hcpc$call$t$tree
h_max <- max(tree$height)

fig_dendro <- fviz_dend(
  tree, k = 6, show_labels = FALSE,
  rect = TRUE,
  rect_border = unname(cluster_palette),
  rect_fill   = TRUE,
  k_colors    = unname(cluster_palette),
  main = "", ylab = "Merge distance (Ward)"
) +
  coord_cartesian(ylim = c(0, h_max * 1.05)) +
  guides(linewidth = "none") +
  theme(plot.title = element_blank(),
        text = element_text(family = "source_sans_3", size = 13))

ggsave("ppt/figures/slide3_dendrogram.png", fig_dendro,
       width = 11, height = 4.5, dpi = 150)
message("Saved: ppt/figures/slide3_dendrogram.png")


# ============================================================
#  SLIDE 4 — Geography & key takeaways
#  Title: "The typology maps coherently onto Swedish geography"
#
#  Bullet points:
#   • Clusters 1 & 2 (both rural types) dominate almost everywhere
#     outside the metropolitan areas — the rural majority.
#   • Cluster 6 (inner Stockholm core) is confined to Stockholm and
#     Uppsala counties; cluster 5 (affluent suburbs) spreads into
#     Skåne (Lund) and Västra Götaland (Mölndal, Partille).
#   • Cluster 4 (regional centres) appears thinly but consistently
#     across most counties — one or two per county.
#
#  Key takeaways:
#   • Biggest empirical break: NOT metro vs. non-metro, but between
#     two kinds of rural — remote & peripheral (Cl. 1), central
#     industrial towns (Cl. 2), and peri-rural commuters (Cl. 3).
#   • At the urban end, two kinds of city: self-contained regional
#     centres (Cl. 4) vs. the metropolitan region (Cl. 5 & 6).
#   • Within the metropolitan region, the data cleanly separate
#     residential suburbs that commute IN (Cl. 5) from the inner core
#     that receives commuters (Cl. 6).
#
#  Figure: slide4_county.png
# ============================================================

county_order <- c(
  "Skåne", "Blekinge", "Halland", "Kronoberg", "Kalmar", "Gotland",
  "Jönköping", "Östergötland", "Södermanland", "Västra Götaland",
  "Örebro", "Västmanland", "Stockholm", "Uppsala", "Dalarna", "Värmland",
  "Gävleborg", "Västernorrland", "Jämtland", "Västerbotten", "Norrbotten"
)

clusters_geo <- clusters |>
  left_join(panel_raw, by = "municipality") |>
  mutate(county = county_names[str_sub(code, 1, 2)]) |>
  filter(!is.na(county))

fig_county <- clusters_geo |>
  count(county, cluster_label) |>
  mutate(
    county        = factor(county, levels = county_order),
    cluster_label = factor(cluster_label, levels = cluster_labels)
  ) |>
  ggplot(aes(county, n, fill = cluster_label)) +
  geom_col(position = "fill") +
  scale_fill_manual(values = cluster_palette |> set_names(cluster_labels),
                    name = NULL) +
  scale_y_continuous(labels = scales::percent_format(), expand = c(0, 0)) +
  labs(x = NULL, y = "Share of municipalities") +
  theme(
    axis.text.x     = element_text(angle = 45, hjust = 1, size = 11),
    legend.position = "bottom",
    legend.text     = element_text(size = 10)
  ) +
  guides(fill = guide_legend(nrow = 2))

ggsave("ppt/figures/slide4_county.png", fig_county,
       width = 12, height = 6.5, dpi = 150)
message("Saved: ppt/figures/slide4_county.png")

message("\nAll figures written to ppt/figures/. Ready to paste into the slide deck.")