library(tidyverse)
Registered S3 methods overwritten by 'ggplot2':
  method         from 
  [.quosures     rlang
  c.quosures     rlang
  print.quosures rlang
Registered S3 method overwritten by 'rvest':
  method            from
  read_xml.response xml2
-- Attaching packages --------------------------------------- tidyverse 1.2.1 --
v ggplot2 3.1.1     v purrr   0.3.2
v tibble  2.1.1     v dplyr   0.8.0.1
v tidyr   0.8.3     v stringr 1.4.0
v readr   1.3.1     v forcats 0.4.0
-- Conflicts ------------------------------------------ tidyverse_conflicts() --
x dplyr::filter() masks stats::filter()
x dplyr::lag()    masks stats::lag()
There is a lot of missing data in the EPC and often there are different expressions describing missingness that need to be standardised. While 9 of these expressions are outlined in the online description of the EPC data, there are a few more that we discovered while working with the Leeds data (and there might be a few more that we haven't found yet).
certificates <- read.csv("~/11.2020_Project Cygnus/20.11.2020_EPC_Leeds/domestic-E08000035-Leeds/certificates.csv",
                         na.strings = c("INVALID", "NO DATA!", "NODATA!", "N/A", "Not recorded", "9999", "Blank", "NULL", "",
                                        "INVALID!", "unknown", "Unknown", "%%MAINHEATCONTROL%%", "not defined"))
# Some entries flag missingness with the phrase "for backward(s) compatibility only";
# the optional "s?" in the pattern catches both spellings in one pass
certificates[] <- lapply(certificates, function(x) replace(x, grepl("for backwards? compatibility only", x), NA))
sum(is.na(certificates))
There are a few errors in the EPC data, such as the accidental doubling of entries, which should be removed to reduce the amount of redundant information. We take this step before removing any unhelpful columns, to ensure that these observations are truly duplicated across all variables.
We arrange the data in reverse order of the date and time the EPCs were lodged, so that the older of each pair of duplicates is removed, as we assume newer entries to be updates.
clean_EPC <- certificates[order(certificates[, 'LODGEMENT_DATETIME'], decreasing = TRUE), ]
clean_EPC <- clean_EPC[!duplicated(select(clean_EPC, -c("LMK_KEY",
"LODGEMENT_DATE",
"LODGEMENT_DATETIME"))),]
We considered removing additional types of potential duplicates as outlined by Hardy and Glew (2019), e.g. by treating EPCs lodged for the same building reference number within the same day or week as updates. We decided against this because we were not sure in which cases a duplicated building reference number may refer to a flat and in which to a whole building.
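For readers who do want to pursue this, such candidates can at least be flagged for manual inspection rather than removed. The sketch below assumes the standard EPC column names BUILDING_REFERENCE_NUMBER and LODGEMENT_DATE and an illustrative 7-day window; both are assumptions, not part of our pipeline.

```r
# Flag EPCs lodged for the same building reference number within 7 days
# of a newer certificate (the 7-day window is an illustrative choice)
possible_updates <- clean_EPC %>%
  mutate(LODGEMENT_DATE = as.Date(LODGEMENT_DATE)) %>%
  group_by(BUILDING_REFERENCE_NUMBER) %>%
  arrange(desc(LODGEMENT_DATE), .by_group = TRUE) %>%
  # days between each certificate and the next newer one for the same reference
  mutate(days_to_newer = as.numeric(lag(LODGEMENT_DATE) - LODGEMENT_DATE)) %>%
  ungroup() %>%
  filter(!is.na(days_to_newer), days_to_newer <= 7)
```

The flagged rows can then be inspected before deciding whether they are genuine updates or distinct dwellings sharing a reference number.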
We drop all the columns that contain geographic descriptors, apart from postcode. This anonymises the data to an extent, and only the postcode is needed to attach more meaningful geographies like LSOAs. You can remove additional columns depending on your intended use of the EPC data.
clean_EPC <- clean_EPC %>%
select(-c(ADDRESS1, ADDRESS2, ADDRESS3, LOCAL_AUTHORITY, CONSTITUENCY, COUNTY,
ADDRESS, LOCAL_AUTHORITY_LABEL, CONSTITUENCY_LABEL, POSTTOWN))
Columns that are mostly empty are unlikely to help our analysis and waste processing power.
Listing the share of NAs in each column may help us decide whether we really want to drop all of the columns above a certain percentage of NAs.
Col_NA_Perc <- sapply(clean_EPC, function(x) sum(is.na(x)) / length(x))
Col_NA_Perc <- Col_NA_Perc[Col_NA_Perc > .7] # more than 70% NAs
print(paste0(names(Col_NA_Perc)," :",unname(Col_NA_Perc)))
[1] "FLOOR_LEVEL :0.704401932924234"
[2] "FLAT_TOP_STOREY :0.74461856321129"
[3] "FLAT_STOREY_COUNT :0.907911535768708"
[4] "FLOOR_ENERGY_EFF :0.942468734126821"
[5] "FLOOR_ENV_EFF :0.942468734126821"
[6] "SHEATING_ENERGY_EFF :1"
[7] "SHEATING_ENV_EFF :1"
[8] "HEAT_LOSS_CORRIDOOR :0.74461856321129"
[9] "UNHEATED_CORRIDOR_LENGTH :0.888626909929575"
The following code removes all of the columns that contain more than 70% NAs.
Alternatively, you could remove only some of the columns identified in the last step.
clean_EPC <- clean_EPC[sapply(clean_EPC, function(x) sum(is.na(x)) / length(x)) < .7]
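If, in place of the threshold-based removal above, you prefer to drop only a named subset of the sparse columns, a sketch might look like the following (the selection is purely illustrative, using names from the listing above):

```r
# Drop only the completely empty columns, keeping e.g. FLOOR_LEVEL
# despite its high NA share; the choice of columns is illustrative
clean_EPC <- clean_EPC %>%
  select(-c(SHEATING_ENERGY_EFF, SHEATING_ENV_EFF))
```

This keeps the decision explicit and reproducible, at the cost of having to revisit the list if the NA pattern changes in a future data release.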
We join our cleaned dataset with the National Statistics Postcode Lookup (NSPL) to add some meaningful geographies. Postcodes should then be removed to further anonymise the outputs and avoid copyright issues.
NSPL <- read.csv("~/11.2020_Project Cygnus/20.11.2020_EPC_Leeds/domestic-E08000035-Leeds/Postcode File/National_Statistics_Postcode_Lookup_UK.csv")
NSPL_LSOA <- NSPL %>%
select(Postcode.3, Local.Authority.Code)
clean_EPC <- inner_join(clean_EPC, NSPL_LSOA, by = c("POSTCODE" = "Postcode.3")) %>%
select(-c(POSTCODE))
Warning message: "Column `POSTCODE`/`Postcode.3` joining factors with different levels, coercing to character vector"
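Note that inner_join() silently drops certificates whose postcode has no match in the lookup. It may be worth counting those rows before committing to the join; a sketch using dplyr::anti_join(), run on clean_EPC before the join above overwrites it:

```r
# Certificates whose postcode is absent from the NSPL; these rows are
# lost by the inner join. Run this before the join itself.
unmatched <- anti_join(clean_EPC, NSPL_LSOA, by = c("POSTCODE" = "Postcode.3"))
nrow(unmatched)
```

A large count here would suggest checking the postcode formatting (e.g. spacing) in both files before joining.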
Clean_Outcome <- matrix(c(ncol(certificates), nrow(certificates), sum(is.na(certificates)),
ncol(clean_EPC), nrow(clean_EPC), sum(is.na(clean_EPC)),
(ncol(certificates) - ncol(clean_EPC)), (nrow(certificates) - nrow(clean_EPC)),
(sum(is.na(certificates)) - sum(is.na(clean_EPC)))),ncol=3,byrow=TRUE)
colnames(Clean_Outcome) <- c("Columns","Rows","NAs")
rownames(Clean_Outcome) <- c("Original","Final","Removed")
Clean_Outcome <- as.table(Clean_Outcome)
Clean_Outcome
         Columns   Rows     NAs
Original      90 291914 4411956
Final         71 290876 1362174
Removed       19   1038 3049782
write.csv(clean_EPC, 'clean_EPC.csv')
Hardy, A.L.R. and Glew, D. 2019. An analysis of errors in the Energy Performance Certificate database. Energy Policy, 129, pp. 1168-1178. Available from: http://eprints.leedsbeckett.ac.uk/id/eprint/5844/1/AnAnalysisofErrorsintheEnergyPerformanceCertificateDatabaseAM-HARDY.pdf