Imports:
import json, os, zipfile
import pandas as pd
import numpy as np
import geopandas as gpd
Extract EPC data for Leeds only:
full_epc_data = zipfile.ZipFile('data/raw/England-and-Wales-EPC-All.zip', 'r')
lad_code = 'E08000035'
file_to_extract = [x for x in full_epc_data.namelist() if lad_code in x and 'certificates.csv' in x][0]
if os.path.exists(f'data/extracts/{file_to_extract}'):
extracted_data_path = f'data/extracts/{file_to_extract}'
else:
extracted_data_path = full_epc_data.extract(file_to_extract, path='data/extracts/')
Read in EPC data for Leeds:
epc_data = pd.read_csv(
extracted_data_path,
low_memory=False,
keep_default_na=True,
na_values=['INVALID!', 'NO DATA!', 'not recorded', 'blank']
)
epc_data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 291914 entries, 0 to 291913 Data columns (total 90 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 LMK_KEY 291914 non-null object 1 ADDRESS1 291914 non-null object 2 ADDRESS2 118730 non-null object 3 ADDRESS3 13970 non-null object 4 POSTCODE 291914 non-null object 5 BUILDING_REFERENCE_NUMBER 291914 non-null int64 6 CURRENT_ENERGY_RATING 291914 non-null object 7 POTENTIAL_ENERGY_RATING 291914 non-null object 8 CURRENT_ENERGY_EFFICIENCY 291914 non-null int64 9 POTENTIAL_ENERGY_EFFICIENCY 291914 non-null int64 10 PROPERTY_TYPE 291914 non-null object 11 BUILT_FORM 284999 non-null object 12 INSPECTION_DATE 291914 non-null object 13 LOCAL_AUTHORITY 291914 non-null object 14 CONSTITUENCY 291914 non-null object 15 COUNTY 1 non-null object 16 LODGEMENT_DATE 291914 non-null object 17 TRANSACTION_TYPE 291906 non-null object 18 ENVIRONMENT_IMPACT_CURRENT 291914 non-null int64 19 ENVIRONMENT_IMPACT_POTENTIAL 291914 non-null int64 20 ENERGY_CONSUMPTION_CURRENT 291914 non-null int64 21 ENERGY_CONSUMPTION_POTENTIAL 291914 non-null float64 22 CO2_EMISSIONS_CURRENT 291914 non-null float64 23 CO2_EMISS_CURR_PER_FLOOR_AREA 291914 non-null float64 24 CO2_EMISSIONS_POTENTIAL 291914 non-null float64 25 LIGHTING_COST_CURRENT 291914 non-null float64 26 LIGHTING_COST_POTENTIAL 291914 non-null float64 27 HEATING_COST_CURRENT 291914 non-null float64 28 HEATING_COST_POTENTIAL 291914 non-null float64 29 HOT_WATER_COST_CURRENT 291914 non-null float64 30 HOT_WATER_COST_POTENTIAL 291914 non-null float64 31 TOTAL_FLOOR_AREA 291914 non-null float64 32 ENERGY_TARIFF 291403 non-null object 33 MAINS_GAS_FLAG 264109 non-null object 34 FLOOR_LEVEL 232517 non-null object 35 FLAT_TOP_STOREY 74527 non-null object 36 FLAT_STOREY_COUNT 26851 non-null float64 37 MAIN_HEATING_CONTROLS 266346 non-null object 38 MULTI_GLAZE_PROPORTION 263774 non-null float64 39 GLAZED_TYPE 264142 non-null object 40 GLAZED_AREA 265466 non-null object 41 EXTENSION_COUNT 265466 non-null float64 42 NUMBER_HABITABLE_ROOMS 265466 non-null float64 43 NUMBER_HEATED_ROOMS 265466 non-null float64 44 LOW_ENERGY_LIGHTING 281490 non-null float64 45 NUMBER_OPEN_FIREPLACES 285720 non-null float64 46 HOTWATER_DESCRIPTION 291914 non-null object 47 HOT_WATER_ENERGY_EFF 291198 non-null object 48 HOT_WATER_ENV_EFF 291157 non-null object 49 FLOOR_DESCRIPTION 291889 non-null object 50 FLOOR_ENERGY_EFF 16789 non-null object 51 FLOOR_ENV_EFF 16789 non-null object 52 WINDOWS_DESCRIPTION 291799 non-null object 53 WINDOWS_ENERGY_EFF 291646 non-null object 54 WINDOWS_ENV_EFF 291646 non-null object 55 WALLS_DESCRIPTION 291912 non-null object 56 WALLS_ENERGY_EFF 291643 non-null object 57 WALLS_ENV_EFF 291643 non-null object 58 SECONDHEAT_DESCRIPTION 291914 non-null object 59 SHEATING_ENERGY_EFF 0 non-null float64 60 SHEATING_ENV_EFF 0 non-null float64 61 ROOF_DESCRIPTION 291715 non-null object 62 ROOF_ENERGY_EFF 234975 non-null object 63 ROOF_ENV_EFF 234975 non-null object 64 MAINHEAT_DESCRIPTION 291914 non-null object 65 MAINHEAT_ENERGY_EFF 291156 non-null object 66 MAINHEAT_ENV_EFF 291159 non-null object 67 MAINHEATCONT_DESCRIPTION 291914 non-null object 68 MAINHEATC_ENERGY_EFF 291449 non-null object 69 MAINHEATC_ENV_EFF 291449 non-null object 70 LIGHTING_DESCRIPTION 291914 non-null object 71 LIGHTING_ENERGY_EFF 291505 non-null object 72 LIGHTING_ENV_EFF 291612 non-null object 73 MAIN_FUEL 289578 non-null object 74 WIND_TURBINE_COUNT 272060 non-null float64 75 HEAT_LOSS_CORRIDOOR 74527 non-null object 76 UNHEATED_CORRIDOR_LENGTH 32490 non-null float64 77 FLOOR_HEIGHT 111261 non-null float64 78 PHOTO_SUPPLY 161373 non-null float64 79 SOLAR_WATER_HEATING_FLAG 175187 non-null object 80 MECHANICAL_VENTILATION 265466 non-null object 81 ADDRESS 291914 non-null object 82 LOCAL_AUTHORITY_LABEL 291914 non-null object 83 CONSTITUENCY_LABEL 291914 non-null object 84 POSTTOWN 291914 non-null object 85 CONSTRUCTION_AGE_BAND 264008 non-null object 86 LODGEMENT_DATETIME 291914 non-null object 87 TENURE 283010 non-null object 88 FIXED_LIGHTING_OUTLETS_COUNT 114067 non-null float64 89 LOW_ENERGY_FIXED_LIGHT_COUNT 114066 non-null float64 dtypes: float64(26), int64(6), object(58) memory usage: 200.4+ MB
We can see that there is 90 columns in the data, many of which contain lots of 'null' values. Let's limit the data to a subset of columns:
columns_to_keep = [
'POSTCODE',
'BUILDING_REFERENCE_NUMBER',
'CURRENT_ENERGY_RATING',
'POTENTIAL_ENERGY_RATING',
'CURRENT_ENERGY_EFFICIENCY',
'POTENTIAL_ENERGY_EFFICIENCY',
'INSPECTION_DATE',
'LODGEMENT_DATE',
'CONSTRUCTION_AGE_BAND'
]
epc_data.drop(columns=epc_data.columns.difference(columns_to_keep), inplace=True)
epc_data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 291914 entries, 0 to 291913 Data columns (total 9 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 POSTCODE 291914 non-null object 1 BUILDING_REFERENCE_NUMBER 291914 non-null int64 2 CURRENT_ENERGY_RATING 291914 non-null object 3 POTENTIAL_ENERGY_RATING 291914 non-null object 4 CURRENT_ENERGY_EFFICIENCY 291914 non-null int64 5 POTENTIAL_ENERGY_EFFICIENCY 291914 non-null int64 6 INSPECTION_DATE 291914 non-null object 7 LODGEMENT_DATE 291914 non-null object 8 CONSTRUCTION_AGE_BAND 264008 non-null object dtypes: int64(3), object(6) memory usage: 20.0+ MB
Grab ONS Postcode Lookup file (2020):
![ ! -f data/ONS-Postcode-Lookup.zip ] && wget https://www.arcgis.com/sharing/rest/content/items/b982ad8161dd4fc29bbe76038ae32c10/data -O 'data/ONS-Postcode-Lookup.zip'
Read in the ONS Postcode Lookup file & drop unnecessary columns:
pcd_lookup = pd.read_csv('data/ONS-Postcode-Lookup.zip', compression='zip', usecols=['pcds', 'lsoa11cd', 'ladcd'], low_memory=False)
pcd_lookup.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 2647046 entries, 0 to 2647045 Data columns (total 3 columns): # Column Dtype --- ------ ----- 0 pcds object 1 lsoa11cd object 2 ladcd object dtypes: object(3) memory usage: 60.6+ MB
Merge the EPC data with the ONS postcode lookup (removing any rows with invalid postcodes):
epc_data['POSTCODE'] = epc_data['POSTCODE'].str.replace('\s*', '').str.upper()
pcd_lookup['POSTCODE'] = pcd_lookup['pcds'].str.replace('\s*', '').str.upper()
count = len(epc_data)
epc_data = epc_data.merge(pcd_lookup, on='POSTCODE', how='inner')
print(f'Dropped {count - len(epc_data)} rows with invalid postcodes')
Dropped 125 rows with invalid postcodes
Remove entries where the LSOA isn't in Leeds
leeds_lsoa_codes = pcd_lookup[pcd_lookup['ladcd'] == lad_code]['lsoa11cd'].drop_duplicates()
epc_data = epc_data[epc_data['lsoa11cd'].isin(leeds_lsoa_codes)]
Sort by EPC date and remove outdated EPC's (ie. if a property has multiple EPC's)
epc_data['LODGEMENT_DATE'] = pd.to_datetime(epc_data['LODGEMENT_DATE'], infer_datetime_format=True, errors='coerce')
epc_data['INSPECTION_DATE'] = pd.to_datetime(epc_data['INSPECTION_DATE'], infer_datetime_format=True, errors='coerce')
epc_data.sort_values(by=['INSPECTION_DATE', 'LODGEMENT_DATE'], ascending=False, inplace=True)
count = len(epc_data)
epc_data.drop_duplicates(subset=['POSTCODE', 'BUILDING_REFERENCE_NUMBER'], keep='first', ignore_index=True, inplace=True)
print(f"Dropped {count - len(epc_data)} outdated EPC's")
Dropped 67987 outdated EPC's
lsoa_summary = pd.DataFrame(index=leeds_lsoa_codes)
Calculate average (mode) current & potential EPC ratings by LSOA:
mode = lambda x: pd.Series.mode(x)[0]
lsoa_summary['CURRENT_EPC_RATING_MODE'] = epc_data.groupby('lsoa11cd')['CURRENT_ENERGY_RATING'].agg(mode)
lsoa_summary['POTENTIAL_EPC_RATING_MODE'] = epc_data.groupby('lsoa11cd')['POTENTIAL_ENERGY_RATING'].agg(mode)
lsoa_summary.info()
<class 'pandas.core.frame.DataFrame'> Index: 482 entries, E01011527 to E01011500 Data columns (total 2 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 CURRENT_EPC_RATING_MODE 482 non-null object 1 POTENTIAL_EPC_RATING_MODE 482 non-null object dtypes: object(2) memory usage: 11.3+ KB
Replace blank age band values with 'UNKNOWN':
epc_data['CONSTRUCTION_AGE_BAND'].fillna('UNKNOWN', inplace=True)
Determine what the current Age Band values are in the EPC data:
current_age_band_values = epc_data['CONSTRUCTION_AGE_BAND'].unique().tolist()
current_age_band_values.sort()
print('Current age band values: \n')
print('\n'.join(current_age_band_values))
Current age band values: England and Wales: 1900-1929 England and Wales: 1930-1949 England and Wales: 1950-1966 England and Wales: 1967-1975 England and Wales: 1976-1982 England and Wales: 1983-1990 England and Wales: 1991-1995 England and Wales: 1996-2002 England and Wales: 2003-2006 England and Wales: 2007 onwards England and Wales: before 1900 UNKNOWN
Standardise the naming of these age bands:
epc_data['CONSTRUCTION_AGE_BAND'] = epc_data['CONSTRUCTION_AGE_BAND'].str.lstrip('England and Wales: ') \
.str.replace('before ', '0-') \
.str.replace(' onwards', '-2020')
standardised_epc_age_bands = epc_data['CONSTRUCTION_AGE_BAND'].unique().tolist()
standardised_epc_age_bands.sort()
print(f'New age band values: \n')
print('\n'.join(standardised_epc_age_bands))
New age band values: 0-1900 1900-1929 1930-1949 1950-1966 1967-1975 1976-1982 1983-1990 1991-1995 1996-2002 2003-2006 2007-2020 UNKNOWN
Let's look at the total number of properties in each age band:
epc_data['CONSTRUCTION_AGE_BAND'].value_counts().sort_index().plot.bar(xlabel='Age Band', ylabel='Number of properties')
<AxesSubplot:xlabel='Age Band', ylabel='Number of properties'>
Now let's calculate the number of properties in each age band by LSOA (from the EPC data):
epc_age_bands_by_lsoa = epc_data[['lsoa11cd', 'CONSTRUCTION_AGE_BAND']].fillna('UNKNOWN').pivot_table(index=['lsoa11cd'], columns='CONSTRUCTION_AGE_BAND', aggfunc='size', fill_value=0)
epc_age_bands_by_lsoa
CONSTRUCTION_AGE_BAND | 0-1900 | 1900-1929 | 1930-1949 | 1950-1966 | 1967-1975 | 1976-1982 | 1983-1990 | 1991-1995 | 1996-2002 | 2003-2006 | 2007-2020 | UNKNOWN |
---|---|---|---|---|---|---|---|---|---|---|---|---|
lsoa11cd | ||||||||||||
E01011264 | 16 | 3 | 69 | 116 | 41 | 3 | 0 | 1 | 0 | 0 | 0 | 4 |
E01011265 | 163 | 110 | 38 | 14 | 47 | 17 | 34 | 9 | 9 | 56 | 10 | 168 |
E01011266 | 13 | 34 | 83 | 45 | 36 | 11 | 6 | 5 | 8 | 93 | 134 | 79 |
E01011267 | 32 | 22 | 45 | 158 | 44 | 11 | 5 | 10 | 14 | 13 | 12 | 94 |
E01011268 | 0 | 0 | 70 | 203 | 27 | 41 | 29 | 3 | 3 | 1 | 0 | 2 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
E01033031 | 8 | 24 | 9 | 88 | 21 | 1 | 0 | 1 | 15 | 62 | 327 | 332 |
E01033032 | 2 | 7 | 8 | 65 | 87 | 5 | 21 | 4 | 44 | 255 | 267 | 858 |
E01033033 | 0 | 0 | 1 | 75 | 7 | 1 | 0 | 1 | 10 | 198 | 252 | 1 |
E01033034 | 18 | 5 | 0 | 0 | 0 | 0 | 6 | 9 | 154 | 287 | 325 | 221 |
E01033035 | 0 | 2 | 2 | 41 | 51 | 0 | 7 | 2 | 20 | 0 | 0 | 587 |
482 rows × 12 columns
Calculate total number of properties and mode age band by LSOA:
lsoa_summary[['EPC_PROPERTY_COUNT', 'EPC_PROPERTY_AGE_BAND_MODE']] = epc_age_bands_by_lsoa.agg(['sum', lambda row: row.idxmax(axis=1)], axis=1)
lsoa_summary
CURRENT_EPC_RATING_MODE | POTENTIAL_EPC_RATING_MODE | EPC_PROPERTY_COUNT | EPC_PROPERTY_AGE_BAND_MODE | |
---|---|---|---|---|
lsoa11cd | ||||
E01011527 | D | B | 352 | 1967-1975 |
E01011589 | D | B | 461 | 0-1900 |
E01011457 | D | C | 337 | 0-1900 |
E01011521 | D | B | 589 | 2003-2006 |
E01011529 | D | C | 285 | 1950-1966 |
... | ... | ... | ... | ... |
E01011537 | D | B | 479 | 1930-1949 |
E01011551 | D | B | 421 | 1900-1929 |
E01011499 | C | B | 940 | UNKNOWN |
E01032496 | C | B | 601 | 2003-2006 |
E01011500 | D | B | 355 | 1950-1966 |
482 rows × 4 columns
Now let's look at the number of LSOA's per average age band:
lsoa_count_by_mode_age_band = lsoa_summary.groupby('EPC_PROPERTY_AGE_BAND_MODE').agg('size')
lsoa_count_by_mode_age_band.sort_index().plot.bar(xlabel='Average Property Age Band', ylabel="Number of LSOA's")
<AxesSubplot:xlabel='Average Property Age Band', ylabel="Number of LSOA's">
Get VOA 2020 Housing Stock data to compare:
voa_housing_stock_data = pd.read_csv('https://assets.publishing.service.gov.uk/government/uploads/system/uploads/attachment_data/file/920382/Table_CTSOP4_1_2020.csv', encoding='cp1252', low_memory=False)
voa_housing_stock_data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 383404 entries, 0 to 383403 Data columns (total 22 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 GEOGRAPHY 383404 non-null object 1 BA_CODE 3053 non-null float64 2 ECODE 383404 non-null object 3 AREA_NAME 383404 non-null object 4 band 383368 non-null object 5 BP_PRE_1900 383368 non-null object 6 BP_1900_1918 383368 non-null object 7 BP_1919_1929 383368 non-null object 8 BP_1930_1939 383368 non-null object 9 BP_1945_1954 383368 non-null object 10 BP_1955_1964 383368 non-null object 11 BP_1965_1972 383368 non-null object 12 BP_1973_1982 383368 non-null object 13 BP_1983_1992 383368 non-null object 14 BP_1993_1999 383368 non-null object 15 BP_2000_2008 383368 non-null object 16 BP_2009_2011 383368 non-null object 17 BP_2012_2014 383368 non-null object 18 BP_2015_2017 383368 non-null object 19 BP_2018_2020 383368 non-null object 20 BP_UNKNOWN 383368 non-null object 21 ALL_PROPERTIES 383368 non-null object dtypes: float64(1), object(21) memory usage: 64.4+ MB
Filter to Leeds LSOA's only:
voa_housing_stock_data = voa_housing_stock_data[(voa_housing_stock_data['band'] == 'All') & (voa_housing_stock_data['ECODE'].isin(leeds_lsoa_codes))]
voa_housing_stock_data.set_index('ECODE', inplace=True)
voa_housing_stock_data.index.name = 'lsoa11cd'
Drop unnecessary columns and standardise age band naming:
columns_to_keep = [x for x in voa_housing_stock_data.columns if x.startswith('BP')]
voa_housing_stock_data.drop(columns=voa_housing_stock_data.columns.difference(columns_to_keep), inplace=True)
voa_housing_stock_data.rename(columns=lambda x: x.replace('BP_', '').replace('_', '-'), inplace=True)
Convert column datatypes to numerical:
voa_housing_stock_data = voa_housing_stock_data.apply(pd.to_numeric, errors='coerce').fillna(0)
Calculate total properties and mode age band by LSOA:
lsoa_summary[['VOA_PROPERTY_COUNT', 'VOA_PROPERTY_AGE_BAND_MODE']] = voa_housing_stock_data.agg(['sum', lambda row: row.idxmax(axis=1)], axis=1)
lsoa_summary
CURRENT_EPC_RATING_MODE | POTENTIAL_EPC_RATING_MODE | EPC_PROPERTY_COUNT | EPC_PROPERTY_AGE_BAND_MODE | VOA_PROPERTY_COUNT | VOA_PROPERTY_AGE_BAND_MODE | |
---|---|---|---|---|---|---|
lsoa11cd | ||||||
E01011527 | D | B | 352 | 1967-1975 | 780 | 1965-1972 |
E01011589 | D | B | 461 | 0-1900 | 730 | PRE-1900 |
E01011457 | D | C | 337 | 0-1900 | 650 | PRE-1900 |
E01011521 | D | B | 589 | 2003-2006 | 1040 | 2000-2008 |
E01011529 | D | C | 285 | 1950-1966 | 590 | 1973-1982 |
... | ... | ... | ... | ... | ... | ... |
E01011537 | D | B | 479 | 1930-1949 | 1010 | 1965-1972 |
E01011551 | D | B | 421 | 1900-1929 | 750 | PRE-1900 |
E01011499 | C | B | 940 | UNKNOWN | 1290 | 2000-2008 |
E01032496 | C | B | 601 | 2003-2006 | 970 | 2000-2008 |
E01011500 | D | B | 355 | 1950-1966 | 720 | 1955-1964 |
482 rows × 6 columns
lsoa_summary.info()
<class 'pandas.core.frame.DataFrame'> Index: 482 entries, E01011527 to E01011500 Data columns (total 6 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 CURRENT_EPC_RATING_MODE 482 non-null object 1 POTENTIAL_EPC_RATING_MODE 482 non-null object 2 EPC_PROPERTY_COUNT 482 non-null object 3 EPC_PROPERTY_AGE_BAND_MODE 482 non-null object 4 VOA_PROPERTY_COUNT 482 non-null object 5 VOA_PROPERTY_AGE_BAND_MODE 482 non-null object dtypes: object(6) memory usage: 26.4+ KB
lsoa_summary['EPC_PROPERTY_COUNT'] = pd.to_numeric(lsoa_summary['EPC_PROPERTY_COUNT'], downcast='integer')
lsoa_summary['VOA_PROPERTY_COUNT'] = pd.to_numeric(lsoa_summary['VOA_PROPERTY_COUNT'], downcast='integer')
Calculate percentage of total properties with a published EPC:
lsoa_summary['PROPERTIES_WITH_EPC_PERC'] = (lsoa_summary['EPC_PROPERTY_COUNT'] / lsoa_summary['VOA_PROPERTY_COUNT']).round(2)
lsoa_summary
CURRENT_EPC_RATING_MODE | POTENTIAL_EPC_RATING_MODE | EPC_PROPERTY_COUNT | EPC_PROPERTY_AGE_BAND_MODE | VOA_PROPERTY_COUNT | VOA_PROPERTY_AGE_BAND_MODE | PROPERTIES_WITH_EPC_PERC | |
---|---|---|---|---|---|---|---|
lsoa11cd | |||||||
E01011527 | D | B | 352 | 1967-1975 | 780 | 1965-1972 | 0.45 |
E01011589 | D | B | 461 | 0-1900 | 730 | PRE-1900 | 0.63 |
E01011457 | D | C | 337 | 0-1900 | 650 | PRE-1900 | 0.52 |
E01011521 | D | B | 589 | 2003-2006 | 1040 | 2000-2008 | 0.57 |
E01011529 | D | C | 285 | 1950-1966 | 590 | 1973-1982 | 0.48 |
... | ... | ... | ... | ... | ... | ... | ... |
E01011537 | D | B | 479 | 1930-1949 | 1010 | 1965-1972 | 0.47 |
E01011551 | D | B | 421 | 1900-1929 | 750 | PRE-1900 | 0.56 |
E01011499 | C | B | 940 | UNKNOWN | 1290 | 2000-2008 | 0.73 |
E01032496 | C | B | 601 | 2003-2006 | 970 | 2000-2008 | 0.62 |
E01011500 | D | B | 355 | 1950-1966 | 720 | 1955-1964 | 0.49 |
482 rows × 7 columns
Sort by LSOA and save as CSV file:
lsoa_summary.sort_index().to_csv('output/leeds-lsoa-summary.csv', index_label='lsoa11cd')
Grab the LSOA 2011 Boundaries Shapefile from the ONS and filter to Leeds only:
lsoa_geo = gpd.read_file('https://opendata.arcgis.com/datasets/c892586698ad4d268f9288f1df20ab77_0.zip?outSR=%7B%22latestWkid%22%3A27700%2C%22wkid%22%3A27700%7D')
lsoa_geo = lsoa_geo[lsoa_geo['LSOA11CD'].isin(leeds_lsoa_codes)]
lsoa_geo.drop(columns=lsoa_geo.columns.difference(['LSOA11CD', 'LSOA11NM', 'geometry']), inplace=True)
lsoa_geo.to_crs(epsg=4326, inplace=True)
Merge the LSOA boundaries table with our LSOA summary table to create a geographic summary:
lsoa_geo = lsoa_geo.merge(lsoa_summary, left_on='LSOA11CD', right_on='lsoa11cd', how='inner')
lsoa_geo.info()
<class 'geopandas.geodataframe.GeoDataFrame'> Int64Index: 482 entries, 0 to 481 Data columns (total 10 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 LSOA11CD 482 non-null object 1 LSOA11NM 482 non-null object 2 geometry 482 non-null geometry 3 CURRENT_EPC_RATING_MODE 482 non-null object 4 POTENTIAL_EPC_RATING_MODE 482 non-null object 5 EPC_PROPERTY_COUNT 482 non-null int16 6 EPC_PROPERTY_AGE_BAND_MODE 482 non-null object 7 VOA_PROPERTY_COUNT 482 non-null int16 8 VOA_PROPERTY_AGE_BAND_MODE 482 non-null object 9 PROPERTIES_WITH_EPC_PERC 482 non-null float64 dtypes: float64(1), geometry(1), int16(2), object(6) memory usage: 35.8+ KB
Save as GeoJSON:
lsoa_geo.to_file('output/leeds-lsoa-summary.geojson', driver='GeoJSON')