This script extracts All-Party Parliamentary Group (APPG) information. For each APPG listed in Parliament’s Register of All-Party Parliamentary Groups (saved locally, but it could be scraped), it extracts the information (stored as HTML tables) into a nested list. The script uses the 18 June 2025 Register, which contains information on 514 individual Groups.

library(dplyr)
library(readr)
library(rvest)
library(stringr)
library(purrr)
library(jsonlite)

Load in APPG webpages (saved locally).

# List every entry directly inside the local webpage folder (full paths,
# no recursion into sub-folders)
files <- list.files("./appg_webpages", full.names = TRUE, recursive = FALSE)

# Drop plain-text files (names ending in ".txt"); every other file is kept
files <- files[!endsWith(files, ".txt")]

# Derive each APPG's name from its file name: the text between "2025:" and
# "(29_06", with surrounding whitespace removed
appg_names <- str_trim(str_extract(files, "(?<=2025:).*?(?=\\(29_06)"))

# Read each file's HTML into a single string (one list element per file)
html_vector <- map(files, read_file)

Populate tibble with APPG names and corresponding webpage’s HTML code.

# One row per APPG: its name plus a list-column holding the page's HTML string
raw_appg_df <- tibble(name = appg_names, html_code = html_vector)

# Preview the first 500 characters of the first page's HTML
substr(pluck(raw_appg_df, "html_code", 1), start = 1, stop = 500)
## [1] "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Strict//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd\"> <html xmlns=http://www.w3.org/1999/xhtml lang=en xml:lang=en><!--\n Page saved with SingleFile \n url: https://publications.parliament.uk/pa/cm/cmallparty/250618/abraham-accords.htm \n saved date: Sun Jun 29 2025 23:00:16 GMT+0100 (British Summer Time)\n--><meta charset=utf-8>\n<style media=projection,screen>HTML,BODY,DIV,SPAN,IFRAME,H1,P,A,IMG,STRONG,OL,UL,LI,FIELDSET,FORM,TABLE,TBODY,TR,TD{"

Demo: reconstruct webpage from HTML string.

# Temporary path with an .html extension to hold the reconstructed page
tmp_file <- tempfile(fileext = ".html")

# Write the first APPG's HTML string to that path
writeLines(text = html_vector[[1]], con = tmp_file)

# Open the reconstructed page in the default browser
browseURL(url = tmp_file)

Demo: read in APPG’s HTML code and extract tables.

# Parse the first APPG's HTML string into a document
test_page <- read_html(raw_appg_df$html_code[[1]])

# Pull every <table> on the page into a list of tibbles and print them
page_tables <- test_page %>%
   html_table()
page_tables
## [[1]]
## # A tibble: 3 × 2
##   X1       X2                                                                   
##   <chr>    <chr>                                                                
## 1 Title    All-Party Parliamentary Group on the Abraham Accords                 
## 2 Purpose  The APPG will champion the historic peace agreements known as the Ab…
## 3 Category Subject Group                                                        
## 
## [[2]]
## # A tibble: 6 × 3
##   X1                         X2                  X3          
##   <chr>                      <chr>               <chr>       
## 1 Officers                   Officers            Officers    
## 2 Role                       Name                Party       
## 3 Chair & Registered Contact Sir Andrew Mitchell Conservative
## 4 Co-Chair                   Lord Mendelsohn     Labour      
## 5 Officer                    Charlie Dewhirst    Conservative
## 6 Officer                    Lord Polak          Conservative
## 
## [[3]]
## # A tibble: 2 × 1
##   X1                                                                            
##   <chr>                                                                         
## 1 "Contact Details"                                                             
## 2 "Registered Contact:\n Sir Andrew Mitchell MP, House of Commons, London, SW1A…
## 
## [[4]]
## # A tibble: 5 × 2
##   X1                                                                       X2   
##   <chr>                                                                    <chr>
## 1 Annual General Meeting                                                   Annu…
## 2 Date of most recent AGM in this Parliament                               24/0…
## 3 Did the group publish an income and expenditure statement relating to t… No   
## 4 Reporting year                                                           24 M…
## 5 Next reporting deadline                                                  24/0…
## 
## [[5]]
## # A tibble: 2 × 1
##   X1                                        
##   <chr>                                     
## 1 Registrable benefits received by the group
## 2 None

Main script: extracts APPG information into a nested list. Reads in HTML code corresponding to an APPG’s webpage from a dataframe, extracts tables, and plucks values.

# Pre-allocate one slot per APPG; growing the list with c() on every iteration
# copies the whole list each time.
appg <- vector("list", nrow(raw_appg_df))

# For each row in APPG dataframe. seq_len() gives an empty sequence for a
# zero-row tibble, whereas 1:nrow() would count down over c(1, 0).
for (n in seq_len(nrow(raw_appg_df))) {
   
   # Parse HTML and extract tables
   appg_tables <- raw_appg_df$html_code[[n]] %>% 
      read_html() %>% 
      html_table()
   
   # Extract table values.
   # NOTE(review): `entry`, `reporting_year` and `next_rep_deadline` are
   # extracted but never added to `temp_list` below; confirm whether they
   # should be part of the output.
   entry <- "2025-06-18"
   
   # Table 1: title, purpose and category sit in column X2, rows 1-3
   appg_name <- pluck(appg_tables, 1, "X2", 1)
   
   appg_purpose <- pluck(appg_tables, 1, "X2", 2)
   
   appg_category <- pluck(appg_tables, 1, "X2", 3)
   
   # Table 3: row 1 is the "Contact Details" header, row 2 the details;
   # line breaks inside the details are stripped
   contact <- pluck(appg_tables, 3, "X1", 2) %>% 
      str_replace_all("\n", "")
   
   # Table 4: AGM and reporting information (dates are given as dd/mm/yyyy)
   most_recent_agm <- as.Date(pluck(appg_tables, 4, "X2", 2), format = "%d/%m/%Y")
   
   published_statement <- pluck(appg_tables, 4, "X2", 3)
   
   reporting_year <- pluck(appg_tables, 4, "X2", 4)
   
   next_rep_deadline <- as.Date(pluck(appg_tables, 4, "X2", 5), format = "%d/%m/%Y")
   
   # Parliamentarians involved in APPG (number of Parliamentarians varies).
   # Table 2's first two rows are header rows, so data starts at row 3.
   # The row index is built with seq_len() because `3:length(...)` counts
   # backwards (e.g. 3:2) when the table has fewer than three rows, which
   # would emit header rows or NULLs as members.
   parliamentarian <- list()
   
   if (length(appg_tables) >= 2 && !is.null(appg_tables[[2]]$X1)) {
      n_officer_rows <- length(appg_tables[[2]]$X1)
      officer_rows <- seq_len(max(n_officer_rows - 2L, 0L)) + 2L
      
      parliamentarian <- lapply(officer_rows, function(m) {
         list(
            member_name  = pluck(appg_tables, 2, "X2", m),
            member_party = pluck(appg_tables, 2, "X3", m),
            member_role  = pluck(appg_tables, 2, "X1", m)
         )
      })
   }
   
   # Check for benefit information: table 5, row 2 reads "None" when the group
   # registered no benefits (NA is used when that cell does not exist)
   benefits <- list()
   val <- pluck(appg_tables, 5, "X1", 2, .default = NA)
   
   # If no benefits, set to FALSE and child items to NA
   if (!is.na(val) && val == "None") {
      benefit_received <- FALSE
      
      benefits <- list(list(
         source      = NA,
         description = NA,
         value       = NA, 
         received    = NA, 
         registered  = NA
      ))
      
   # If benefits received, extract values (number of benefits varies).
   # NOTE(review): this branch is also taken when table 5 is missing
   # (val is NA), so such pages are recorded as benefit_received = TRUE with
   # an empty benefits list; confirm that is intended.
   } else {
      benefit_received <- TRUE
      
      # Table 6 is read from row 3 onwards, presumably because it has the same
      # two header rows as the officers table (TODO confirm). The row index is
      # guarded the same way so short tables yield no rows.
      if (length(appg_tables) >= 6 && !is.null(appg_tables[[6]]$X1)) {
         n_benefit_rows <- length(appg_tables[[6]]$X1)
         benefit_rows <- seq_len(max(n_benefit_rows - 2L, 0L)) + 2L
         
         benefits <- lapply(benefit_rows, function(i) {
            list(
               source      = pluck(appg_tables, 6, "X1", i),
               description = pluck(appg_tables, 6, "X2", i),
               value       = pluck(appg_tables, 6, "X3", i),
               received    = pluck(appg_tables, 6, "X4", i),
               registered  = pluck(appg_tables, 6, "X5", i)
            )
         })
      }
   }
   
   # Combine everything for this APPG
   temp_list <- list(
      appg_name           = appg_name, 
      appg_purpose        = appg_purpose, 
      appg_category       = appg_category,
      parliamentarian     = parliamentarian,
      contact             = contact, 
      most_recent_agm     = most_recent_agm,
      published_statement = published_statement, 
      benefit_received    = benefit_received, 
      benefits            = benefits
   )
   
   # Store in the pre-allocated result list
   appg[[n]] <- temp_list
}

# Number of APPG records collected
length(appg)
## [1] 514
# Emit a blank line between the two outputs
writeLines("")
# Show the first APPG record
appg[[1]]
## $appg_name
## [1] "All-Party Parliamentary Group on the Abraham Accords"
## 
## $appg_purpose
## [1] "The APPG will champion the historic peace agreements known as the Abraham Accords."
## 
## $appg_category
## [1] "Subject Group"
## 
## $parliamentarian
## $parliamentarian[[1]]
## $parliamentarian[[1]]$member_name
## [1] "Sir Andrew Mitchell"
## 
## $parliamentarian[[1]]$member_party
## [1] "Conservative"
## 
## $parliamentarian[[1]]$member_role
## [1] "Chair & Registered Contact"
## 
## 
## $parliamentarian[[2]]
## $parliamentarian[[2]]$member_name
## [1] "Lord Mendelsohn"
## 
## $parliamentarian[[2]]$member_party
## [1] "Labour"
## 
## $parliamentarian[[2]]$member_role
## [1] "Co-Chair"
## 
## 
## $parliamentarian[[3]]
## $parliamentarian[[3]]$member_name
## [1] "Charlie Dewhirst"
## 
## $parliamentarian[[3]]$member_party
## [1] "Conservative"
## 
## $parliamentarian[[3]]$member_role
## [1] "Officer"
## 
## 
## $parliamentarian[[4]]
## $parliamentarian[[4]]$member_name
## [1] "Lord Polak"
## 
## $parliamentarian[[4]]$member_party
## [1] "Conservative"
## 
## $parliamentarian[[4]]$member_role
## [1] "Officer"
## 
## 
## 
## $contact
## [1] "Registered Contact: Sir Andrew Mitchell MP, House of Commons, London, SW1A 0AA. Tel: 020 7219 8516. Email: andrew.mitchell.mp@parliament.uk    Secretariat: UK Abraham Accords Group acts as the group's secretariat. https://www.abraham-accords.uk"
## 
## $most_recent_agm
## [1] "2025-03-24"
## 
## $published_statement
## [1] "No"
## 
## $benefit_received
## [1] FALSE
## 
## $benefits
## $benefits[[1]]
## $benefits[[1]]$source
## [1] NA
## 
## $benefits[[1]]$description
## [1] NA
## 
## $benefits[[1]]$value
## [1] NA
## 
## $benefits[[1]]$received
## [1] NA
## 
## $benefits[[1]]$registered
## [1] NA

Save output. To re-read into memory: readRDS("all_appgs.RData").

# Serialise the nested list to disk as a single R object (RDS format)
saveRDS(object = appg, file = "all_appgs.RData")

Convert nested list to JSON and save. To convert back: fromJSON(txt, simplifyDataFrame = FALSE, flatten = FALSE).

# Render the nested list as pretty-printed JSON, unboxing length-one vectors
# into scalars
appg_json <- appg %>%
   toJSON(pretty = TRUE, auto_unbox = TRUE)

# Write the JSON text to file, followed by a newline
cat(appg_json, file = "all_appgs.json", sep = "\n")