import json, os
from zipfile import ZipFile
import pandas as pd
import geopandas as gpd
ZIPPED_EPC_DATA_PATH = 'data/raw/all-domestic-certificates.zip'
ZIPPED_ONS_POSTCODE_LOOKUP = 'data/raw/PCD_OA_LSOA_MSOA_LAD_NOV20_UK_LU.zip'
LEEDS_EPC_FILENAME = 'domestic-E08000035-Leeds/certificates.csv'
PCD_LOOKUP_FILENAME = 'PCD_OA_LSOA_MSOA_LAD_NOV20_UK_LU.csv'
EXTRACT_DIR = 'data/extracts'
OUTPUT_DIR = 'data/output'
def extract_single_file_from_zip(zip_filepath, file_to_extract, extract_dir):
output_filepath = f'{extract_dir}/{file_to_extract}'
if not os.path.exists(output_filepath):
print(f'{output_filepath} does not exist')
with ZipFile(zip_filepath, mode='r') as zipped:
output_filepath = zipped.extract(file_to_extract, path=extract_dir)
return output_filepath
extracted_epc_fp = extract_single_file_from_zip(ZIPPED_EPC_DATA_PATH, LEEDS_EPC_FILENAME, EXTRACT_DIR)
extracted_pcd_lookup_fp = extract_single_file_from_zip(ZIPPED_ONS_POSTCODE_LOOKUP, PCD_LOOKUP_FILENAME, EXTRACT_DIR)
data/extracts/domestic-E08000035-Leeds/certificates.csv does not exist data/extracts/PCD_OA_LSOA_MSOA_LAD_NOV20_UK_LU.csv does not exist
epc_data = pd.read_csv(
extracted_epc_fp,
low_memory=False,
usecols=[
'POSTCODE',
'BUILDING_REFERENCE_NUMBER',
'CURRENT_ENERGY_RATING',
'POTENTIAL_ENERGY_RATING',
'CURRENT_ENERGY_EFFICIENCY',
'POTENTIAL_ENERGY_EFFICIENCY',
'INSPECTION_DATE',
'TRANSACTION_TYPE',
'CONSTRUCTION_AGE_BAND'
],
parse_dates=['INSPECTION_DATE'],
infer_datetime_format=True,
na_values=['INVALID!', 'NO DATA!', 'NODATA!' 'not recorded', 'blank'],
keep_default_na=True
)
epc_data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 291914 entries, 0 to 291913 Data columns (total 9 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 POSTCODE 291914 non-null object 1 BUILDING_REFERENCE_NUMBER 291914 non-null int64 2 CURRENT_ENERGY_RATING 291914 non-null object 3 POTENTIAL_ENERGY_RATING 291914 non-null object 4 CURRENT_ENERGY_EFFICIENCY 291914 non-null int64 5 POTENTIAL_ENERGY_EFFICIENCY 291914 non-null int64 6 INSPECTION_DATE 291914 non-null datetime64[ns] 7 TRANSACTION_TYPE 291914 non-null object 8 CONSTRUCTION_AGE_BAND 264008 non-null object dtypes: datetime64[ns](1), int64(3), object(5) memory usage: 20.0+ MB
pcd_lookup = pd.read_csv(
extracted_pcd_lookup_fp,
low_memory=False,
dtype='str',
usecols=['pcds', 'lsoa11cd', 'lsoa11nm', 'ladcd', 'ladnm']
)
pcd_lookup.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 2651389 entries, 0 to 2651388 Data columns (total 5 columns): # Column Dtype --- ------ ----- 0 pcds object 1 lsoa11cd object 2 ladcd object 3 lsoa11nm object 4 ladnm object dtypes: object(5) memory usage: 101.1+ MB
epc_data.sort_values(by='INSPECTION_DATE', ascending=False, inplace=True)
epc_data.drop_duplicates(subset=['BUILDING_REFERENCE_NUMBER'], keep='first', inplace=True)
epc_data.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 223903 entries, 210533 to 240321 Data columns (total 9 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 POSTCODE 223903 non-null object 1 BUILDING_REFERENCE_NUMBER 223903 non-null int64 2 CURRENT_ENERGY_RATING 223903 non-null object 3 POTENTIAL_ENERGY_RATING 223903 non-null object 4 CURRENT_ENERGY_EFFICIENCY 223903 non-null int64 5 POTENTIAL_ENERGY_EFFICIENCY 223903 non-null int64 6 INSPECTION_DATE 223903 non-null datetime64[ns] 7 TRANSACTION_TYPE 223903 non-null object 8 CONSTRUCTION_AGE_BAND 199075 non-null object dtypes: datetime64[ns](1), int64(3), object(5) memory usage: 17.1+ MB
epc_data['POSTCODE'] = epc_data['POSTCODE'].str.replace('\s*', '').str.upper()
pcd_lookup['POSTCODE'] = pcd_lookup['pcds'].str.replace('\s*', '').str.upper()
epc_data = epc_data.merge(
pcd_lookup,
on='POSTCODE',
how='inner',
)
epc_data.drop(columns=['POSTCODE', 'pcds'], inplace=True)
epc_data.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 223818 entries, 0 to 223817 Data columns (total 12 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 BUILDING_REFERENCE_NUMBER 223818 non-null int64 1 CURRENT_ENERGY_RATING 223818 non-null object 2 POTENTIAL_ENERGY_RATING 223818 non-null object 3 CURRENT_ENERGY_EFFICIENCY 223818 non-null int64 4 POTENTIAL_ENERGY_EFFICIENCY 223818 non-null int64 5 INSPECTION_DATE 223818 non-null datetime64[ns] 6 TRANSACTION_TYPE 223818 non-null object 7 CONSTRUCTION_AGE_BAND 199055 non-null object 8 lsoa11cd 223818 non-null object 9 ladcd 223818 non-null object 10 lsoa11nm 223818 non-null object 11 ladnm 223818 non-null object dtypes: datetime64[ns](1), int64(3), object(8) memory usage: 22.2+ MB
LADCD = 'E08000035'
epc_data = epc_data[epc_data['ladcd'] == LADCD]
epc_data.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 223416 entries, 0 to 223817 Data columns (total 12 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 BUILDING_REFERENCE_NUMBER 223416 non-null int64 1 CURRENT_ENERGY_RATING 223416 non-null object 2 POTENTIAL_ENERGY_RATING 223416 non-null object 3 CURRENT_ENERGY_EFFICIENCY 223416 non-null int64 4 POTENTIAL_ENERGY_EFFICIENCY 223416 non-null int64 5 INSPECTION_DATE 223416 non-null datetime64[ns] 6 TRANSACTION_TYPE 223416 non-null object 7 CONSTRUCTION_AGE_BAND 199042 non-null object 8 lsoa11cd 223416 non-null object 9 ladcd 223416 non-null object 10 lsoa11nm 223416 non-null object 11 ladnm 223416 non-null object dtypes: datetime64[ns](1), int64(3), object(8) memory usage: 22.2+ MB
epc_data['CURRENT_ENERGY_RATING'].value_counts().sort_index().plot.bar(xlabel='EPC Rating', ylabel='Number of properties')
<AxesSubplot:xlabel='EPC Rating', ylabel='Number of properties'>
epc_data['TRANSACTION_TYPE'].value_counts().plot.barh(xlabel='Reason for EPC Inspection', ylabel='Number of properties')
<AxesSubplot:ylabel='Reason for EPC Inspection'>
epc_data['CONSTRUCTION_AGE_BAND'] = epc_data['CONSTRUCTION_AGE_BAND'] \
.fillna('UNKNOWN') \
.str.replace('England and Wales: ', '') \
.str.replace('before ', 'PRE-') \
.str.replace(' onwards', '-2020')
epc_data['CONSTRUCTION_AGE_BAND'].value_counts().sort_index(ascending=False).plot.barh(xlabel='Construction Age Band', ylabel='Number of properties')
<AxesSubplot:ylabel='Construction Age Band'>
epc_data.sort_values(by='lsoa11cd', inplace=True)
lsoa_averages = pd.DataFrame(index=epc_data['lsoa11cd'].unique())
lsoa_age_band_data = epc_data[['lsoa11cd', 'CONSTRUCTION_AGE_BAND']].pivot_table(index=['lsoa11cd'], columns='CONSTRUCTION_AGE_BAND', aggfunc='size', fill_value=0)
lsoa_age_band_data
CONSTRUCTION_AGE_BAND | 1900-1929 | 1930-1949 | 1950-1966 | 1967-1975 | 1976-1982 | 1983-1990 | 1991-1995 | 1996-2002 | 2003-2006 | 2007-2020 | PRE-1900 | UNKNOWN |
---|---|---|---|---|---|---|---|---|---|---|---|---|
lsoa11cd | ||||||||||||
E01011264 | 3 | 69 | 116 | 41 | 3 | 0 | 1 | 0 | 0 | 0 | 16 | 4 |
E01011265 | 110 | 38 | 14 | 47 | 17 | 34 | 9 | 9 | 56 | 10 | 163 | 168 |
E01011266 | 35 | 83 | 45 | 36 | 11 | 6 | 5 | 8 | 93 | 134 | 12 | 79 |
E01011267 | 22 | 45 | 158 | 44 | 11 | 5 | 10 | 14 | 13 | 12 | 32 | 94 |
E01011268 | 0 | 70 | 203 | 27 | 41 | 29 | 3 | 3 | 1 | 0 | 0 | 2 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
E01033031 | 24 | 9 | 88 | 21 | 1 | 0 | 1 | 15 | 62 | 327 | 8 | 332 |
E01033032 | 7 | 8 | 65 | 87 | 5 | 21 | 4 | 44 | 256 | 266 | 2 | 858 |
E01033033 | 0 | 1 | 75 | 7 | 1 | 0 | 1 | 10 | 198 | 252 | 0 | 1 |
E01033034 | 5 | 0 | 0 | 0 | 0 | 6 | 9 | 154 | 287 | 325 | 18 | 221 |
E01033035 | 2 | 2 | 41 | 51 | 0 | 7 | 2 | 20 | 0 | 0 | 0 | 587 |
482 rows × 12 columns
lsoa_epc_rating_data = epc_data[['lsoa11cd', 'CURRENT_ENERGY_RATING']].pivot_table(index=['lsoa11cd'], columns='CURRENT_ENERGY_RATING', aggfunc='size', fill_value=0)
lsoa_epc_rating_data
CURRENT_ENERGY_RATING | A | B | C | D | E | F | G |
---|---|---|---|---|---|---|---|
lsoa11cd | |||||||
E01011264 | 0 | 1 | 54 | 158 | 31 | 7 | 2 |
E01011265 | 0 | 104 | 235 | 197 | 122 | 11 | 6 |
E01011266 | 0 | 47 | 211 | 209 | 71 | 6 | 3 |
E01011267 | 0 | 106 | 150 | 161 | 33 | 7 | 3 |
E01011268 | 0 | 9 | 151 | 167 | 43 | 6 | 3 |
... | ... | ... | ... | ... | ... | ... | ... |
E01033031 | 0 | 239 | 349 | 157 | 111 | 32 | 0 |
E01033032 | 0 | 937 | 541 | 118 | 26 | 1 | 0 |
E01033033 | 0 | 55 | 413 | 73 | 4 | 1 | 0 |
E01033034 | 0 | 210 | 504 | 268 | 40 | 2 | 1 |
E01033035 | 0 | 0 | 192 | 86 | 299 | 94 | 41 |
482 rows × 7 columns
lsoa_averages['AGE_BAND_MODE'] = lsoa_age_band_data.agg(pd.Series.idxmax, axis=1)
lsoa_averages['EPC_RATING_MODE'] = lsoa_epc_rating_data.agg(pd.Series.idxmax, axis=1)
lsoa_averages['ENERGY_EFFICIENCY_MEAN'] = epc_data.groupby('lsoa11cd')['CURRENT_ENERGY_EFFICIENCY'].mean().round().astype('int')
lsoa_averages
AGE_BAND_MODE | EPC_RATING_MODE | ENERGY_EFFICIENCY_MEAN | |
---|---|---|---|
E01011264 | 1950-1966 | D | 62 |
E01011265 | UNKNOWN | C | 67 |
E01011266 | 2007-2020 | C | 66 |
E01011267 | 1950-1966 | D | 69 |
E01011268 | 1950-1966 | D | 65 |
... | ... | ... | ... |
E01033031 | UNKNOWN | C | 70 |
E01033032 | UNKNOWN | B | 79 |
E01033033 | 2007-2020 | C | 75 |
E01033034 | 2007-2020 | C | 73 |
E01033035 | UNKNOWN | E | 52 |
482 rows × 3 columns
lsoa_averages['EPC_RATING_MEAN'] = pd.cut(
lsoa_averages['ENERGY_EFFICIENCY_MEAN'],
bins=[0, 20, 38, 54, 68, 80, 91, float('inf')],
labels=['G', 'F', 'E', 'D', 'C', 'B', 'A']
)
lsoa_averages
AGE_BAND_MODE | EPC_RATING_MODE | ENERGY_EFFICIENCY_MEAN | EPC_RATING_MEAN | |
---|---|---|---|---|
E01011264 | 1950-1966 | D | 62 | D |
E01011265 | UNKNOWN | C | 67 | D |
E01011266 | 2007-2020 | C | 66 | D |
E01011267 | 1950-1966 | D | 69 | C |
E01011268 | 1950-1966 | D | 65 | D |
... | ... | ... | ... | ... |
E01033031 | UNKNOWN | C | 70 | C |
E01033032 | UNKNOWN | B | 79 | C |
E01033033 | 2007-2020 | C | 75 | C |
E01033034 | 2007-2020 | C | 73 | C |
E01033035 | UNKNOWN | E | 52 | E |
482 rows × 4 columns