This script extracts All-Party Parliamentary Group (APPG) information. For each APPG listed in Parliament’s Register of All-Party Parliamentary Groups (the webpages are saved locally, but could be scraped instead), it extracts the information held in the page’s tables into a nested list. The script uses the 18 June 2025 Register, which contains information on 514 individual Groups.
library(dplyr)
library(readr)
library(rvest)
library(stringr)
library(purrr)
library(jsonlite)
Load in APPG webpages (saved locally).
# Paths of everything stored directly inside ./appg_webpages (no recursion)
files <- list.files("./appg_webpages", recursive = FALSE, full.names = TRUE)
# Drop .txt files; whatever remains is assumed to be a saved HTML page
files <- files[!endsWith(files, ".txt")]
# APPG name = the text between "2025:" and "(29_06" in each path, trimmed
appg_names <- str_trim(str_extract(files, "(?<=2025:).*?(?=\\(29_06)"))
# Read each page into a single string, giving one list element per file
html_vector <- lapply(files, \(path) read_file(path))
Populate tibble with APPG names and corresponding webpage’s HTML code.
# One row per APPG: its name alongside the raw HTML of its webpage
raw_appg_df <- tibble(name = appg_names, html_code = html_vector)
# Show the first 500 characters of the first page's HTML
raw_appg_df$html_code[[1]] |>
  substr(1, 500)
## [1] "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Strict//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd\"> <html xmlns=http://www.w3.org/1999/xhtml lang=en xml:lang=en><!--\n Page saved with SingleFile \n url: https://publications.parliament.uk/pa/cm/cmallparty/250618/abraham-accords.htm \n saved date: Sun Jun 29 2025 23:00:16 GMT+0100 (British Summer Time)\n--><meta charset=utf-8>\n<style media=projection,screen>HTML,BODY,DIV,SPAN,IFRAME,H1,P,A,IMG,STRONG,OL,UL,LI,FIELDSET,FORM,TABLE,TBODY,TR,TD{"
Demo: reconstruct webpage from HTML string.
# Write the first page's HTML string to a temporary .html file
tmp_file <- tempfile(fileext = ".html")
html_vector[[1]] |>
  writeLines(con = tmp_file)
# Open the reconstructed page in the default browser
browseURL(url = tmp_file)
Demo: read in APPG’s HTML code and extract tables.
# Parse the first APPG's stored HTML string into a document
test_page <- read_html(raw_appg_df$html_code[[1]])
# Extract every table found on the page
page_tables <- test_page %>%
  html_table()
# Print the extracted tables for inspection
page_tables
## [[1]]
## # A tibble: 3 × 2
## X1 X2
## <chr> <chr>
## 1 Title All-Party Parliamentary Group on the Abraham Accords
## 2 Purpose The APPG will champion the historic peace agreements known as the Ab…
## 3 Category Subject Group
##
## [[2]]
## # A tibble: 6 × 3
## X1 X2 X3
## <chr> <chr> <chr>
## 1 Officers Officers Officers
## 2 Role Name Party
## 3 Chair & Registered Contact Sir Andrew Mitchell Conservative
## 4 Co-Chair Lord Mendelsohn Labour
## 5 Officer Charlie Dewhirst Conservative
## 6 Officer Lord Polak Conservative
##
## [[3]]
## # A tibble: 2 × 1
## X1
## <chr>
## 1 "Contact Details"
## 2 "Registered Contact:\n Sir Andrew Mitchell MP, House of Commons, London, SW1A…
##
## [[4]]
## # A tibble: 5 × 2
## X1 X2
## <chr> <chr>
## 1 Annual General Meeting Annu…
## 2 Date of most recent AGM in this Parliament 24/0…
## 3 Did the group publish an income and expenditure statement relating to t… No
## 4 Reporting year 24 M…
## 5 Next reporting deadline 24/0…
##
## [[5]]
## # A tibble: 2 × 1
## X1
## <chr>
## 1 Registrable benefits received by the group
## 2 None
Main script: extracts APPG information into a nested list. Reads in HTML code corresponding to an APPG’s webpage from a dataframe, extracts tables, and plucks values.
# Build `appg`: a list with one nested record per row of `raw_appg_df`.
#
# Each record is produced by parsing the APPG's saved HTML, extracting its
# tables with html_table(), and plucking values by table position:
#   table 1 - title / purpose / category (values in column X2)
#   table 2 - officers (title row, header row, then one row per officer)
#   table 3 - contact details (row 2 of column X1)
#   table 4 - AGM and reporting information (values in column X2)
#   table 5 - benefits marker (row 2 of X1 is "None" when nothing was received)
#   table 6 - benefits detail, read from row 3 onwards (columns X1..X5)

# Row indices of the data rows in a scraped table, i.e. every row after the
# first `n_header` rows. Returns integer(0) when there are no data rows, so
# callers never iterate over a descending sequence such as 3:2 (which would
# pick up the header row and an out-of-range row).
table_data_rows <- function(tbl, n_header = 2L) {
  seq_len(max(nrow(tbl) - n_header, 0L)) + n_header
}

# List of officers (name, party, role) taken from the second table.
# Returns an empty list when the table is absent, has no X1 column, or holds
# only its title and header rows.
extract_parliamentarians <- function(tables) {
  if (length(tables) < 2 || !"X1" %in% names(tables[[2]])) {
    return(list())
  }
  lapply(table_data_rows(tables[[2]]), function(row) {
    list(
      member_name = pluck(tables, 2, "X2", row),
      member_party = pluck(tables, 2, "X3", row),
      member_role = pluck(tables, 2, "X1", row)
    )
  })
}

# Benefit information for one APPG.
# Returns list(benefit_received = <logical>, benefits = <list of records>).
# When the fifth table's second row is "None", benefit_received is FALSE and
# benefits holds a single all-NA placeholder record. Otherwise
# benefit_received is TRUE and benefits holds one record per data row of the
# sixth table (possibly none if that table is missing or has no data rows).
extract_benefits <- function(tables) {
  marker <- pluck(tables, 5, "X1", 2, .default = NA)

  if (!is.na(marker) && marker == "None") {
    placeholder <- list(
      source = NA,
      description = NA,
      value = NA,
      received = NA,
      registered = NA
    )
    return(list(benefit_received = FALSE, benefits = list(placeholder)))
  }

  benefits <- list()
  if (length(tables) >= 6 && "X1" %in% names(tables[[6]])) {
    # Number of benefits varies between APPGs
    benefits <- lapply(table_data_rows(tables[[6]]), function(row) {
      list(
        source = pluck(tables, 6, "X1", row),
        description = pluck(tables, 6, "X2", row),
        value = pluck(tables, 6, "X3", row),
        received = pluck(tables, 6, "X4", row),
        registered = pluck(tables, 6, "X5", row)
      )
    })
  }
  list(benefit_received = TRUE, benefits = benefits)
}

# NOTE(review): `entry` (presumably the Register date), `reporting_year` and
# `next_rep_deadline` are extracted but not stored in the output record.
# They are kept here so the output shape is unchanged; confirm whether they
# were meant to be added to the record.
entry <- "2025-06-18"

# Preallocate the result instead of growing it with c() on every iteration
appg <- vector("list", nrow(raw_appg_df))

# seq_len() yields an empty sequence for an empty data frame, unlike 1:nrow()
for (n in seq_len(nrow(raw_appg_df))) {
  # Parse HTML and extract tables
  appg_tables <- raw_appg_df$html_code[[n]] %>%
    read_html() %>%
    html_table()

  # Extract scalar table values
  appg_name <- pluck(appg_tables, 1, "X2", 1)
  appg_purpose <- pluck(appg_tables, 1, "X2", 2)
  appg_category <- pluck(appg_tables, 1, "X2", 3)
  contact <- pluck(appg_tables, 3, "X1", 2) %>%
    str_replace_all("\n", "")
  most_recent_agm <- as.Date(
    pluck(appg_tables, 4, "X2", 2),
    format = "%d/%m/%Y"
  )
  published_statement <- pluck(appg_tables, 4, "X2", 3)
  reporting_year <- pluck(appg_tables, 4, "X2", 4)
  next_rep_deadline <- as.Date(
    pluck(appg_tables, 4, "X2", 5),
    format = "%d/%m/%Y"
  )

  # Variable-length sections
  parliamentarian <- extract_parliamentarians(appg_tables)
  benefit_info <- extract_benefits(appg_tables)

  # Combine everything for this APPG and store it in its slot
  appg[[n]] <- list(
    appg_name = appg_name,
    appg_purpose = appg_purpose,
    appg_category = appg_category,
    parliamentarian = parliamentarian,
    contact = contact,
    most_recent_agm = most_recent_agm,
    published_statement = published_statement,
    benefit_received = benefit_info$benefit_received,
    benefits = benefit_info$benefits
  )
}
# Number of APPG records extracted (one per row of raw_appg_df)
length(appg)
## [1] 514
# Emit a blank line before printing the first record
cat("\n")
# Inspect the nested record of the first APPG
appg[[1]]
## $appg_name
## [1] "All-Party Parliamentary Group on the Abraham Accords"
##
## $appg_purpose
## [1] "The APPG will champion the historic peace agreements known as the Abraham Accords."
##
## $appg_category
## [1] "Subject Group"
##
## $parliamentarian
## $parliamentarian[[1]]
## $parliamentarian[[1]]$member_name
## [1] "Sir Andrew Mitchell"
##
## $parliamentarian[[1]]$member_party
## [1] "Conservative"
##
## $parliamentarian[[1]]$member_role
## [1] "Chair & Registered Contact"
##
##
## $parliamentarian[[2]]
## $parliamentarian[[2]]$member_name
## [1] "Lord Mendelsohn"
##
## $parliamentarian[[2]]$member_party
## [1] "Labour"
##
## $parliamentarian[[2]]$member_role
## [1] "Co-Chair"
##
##
## $parliamentarian[[3]]
## $parliamentarian[[3]]$member_name
## [1] "Charlie Dewhirst"
##
## $parliamentarian[[3]]$member_party
## [1] "Conservative"
##
## $parliamentarian[[3]]$member_role
## [1] "Officer"
##
##
## $parliamentarian[[4]]
## $parliamentarian[[4]]$member_name
## [1] "Lord Polak"
##
## $parliamentarian[[4]]$member_party
## [1] "Conservative"
##
## $parliamentarian[[4]]$member_role
## [1] "Officer"
##
##
##
## $contact
## [1] "Registered Contact: Sir Andrew Mitchell MP, House of Commons, London, SW1A 0AA. Tel: 020 7219 8516. Email: andrew.mitchell.mp@parliament.uk Secretariat: UK Abraham Accords Group acts as the group's secretariat. https://www.abraham-accords.uk"
##
## $most_recent_agm
## [1] "2025-03-24"
##
## $published_statement
## [1] "No"
##
## $benefit_received
## [1] FALSE
##
## $benefits
## $benefits[[1]]
## $benefits[[1]]$source
## [1] NA
##
## $benefits[[1]]$description
## [1] NA
##
## $benefits[[1]]$value
## [1] NA
##
## $benefits[[1]]$received
## [1] NA
##
## $benefits[[1]]$registered
## [1] NA
Save output. The file is written with saveRDS(), so despite its .RData extension it must be re-read with readRDS("all_appgs.RData"), not load().
# Serialise the nested list with saveRDS(); the file is in RDS format despite
# its .RData name, so restore it with readRDS() rather than load()
saveRDS(appg, "all_appgs.RData")
Convert nested list to JSON and save. To convert back: fromJSON(txt, simplifyDataFrame = FALSE, flatten = FALSE).
# Serialise the nested list as pretty-printed JSON, with length-one vectors
# written as scalars rather than one-element arrays
appg_json <- appg |>
  toJSON(pretty = TRUE, auto_unbox = TRUE)
# Write the JSON text to disk
write(x = appg_json, file = "all_appgs.json")