```python
import geopandas as gpd
import pandas as pd
import numpy as np
```
Train/test split
Split the data into multiple train/test sets so that the model's prediction accuracy can be assessed against every country: each split holds out one country for testing and trains on the remaining four (leave-one-country-out).
```python
country_names = ['Germany', 'Poland', 'Czechia', 'Slovakia', 'Austria']

TRAINING1, TESTING1 = ['Poland', 'Germany', 'Austria', 'Czechia'], ['Slovakia']
TRAINING2, TESTING2 = ['Slovakia', 'Germany', 'Austria', 'Czechia'], ['Poland']
TRAINING3, TESTING3 = ['Poland', 'Slovakia', 'Austria', 'Czechia'], ['Germany']
TRAINING4, TESTING4 = ['Poland', 'Germany', 'Slovakia', 'Czechia'], ['Austria']
TRAINING5, TESTING5 = ['Poland', 'Germany', 'Austria', 'Slovakia'], ['Czechia']
```
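The five constant pairs above together form a leave-one-country-out scheme. As a sanity check, a minimal sketch (not part of the pipeline) that derives the same pairs, up to ordering, from `country_names`; `folds` and `held_out` are hypothetical names:

```python
# sketch only: derive the five leave-one-country-out pairs programmatically
folds = [
    ([c for c in country_names if c != held_out], [held_out])
    for held_out in ['Slovakia', 'Poland', 'Germany', 'Austria', 'Czechia']
]
for training, testing in folds:
    print(training, testing)
```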
```python
coredir = '/data/uscuni-eurofab-overture/'
regions_datadir = coredir
tessellations_dir = f'{coredir}processed_data/tessellations/'
buildings_dir = f'{coredir}processed_data/buildings/'
data_dir = f'{coredir}processed_data/chars/'
target_dir = f'{coredir}processed_data/target_clusters/'
hex_dir = f'{coredir}processed_data/hexagons/'
```
```python
if coredir == '/data/uscuni-eurofab/':
    region_hulls = gpd.read_parquet(
        regions_datadir + "regions/" + "ms_ce_region_hulls.parquet"
    )
else:
    region_hulls = gpd.read_parquet(
        regions_datadir + "regions/" + "ov_ce_region_hulls.parquet"
    )
```
```python
kernel = 'gaussian'
spatial_lag = 3
bandwith_type = -1  # identifier kept as spelled; it is part of the lag file names
char_type = 'only_unweighted_lag_overture'
```
Download a GeoJSON of all EU countries, if not already present.
```python
# download countries
# !wget https://gisco-services.ec.europa.eu/distribution/v2/countries/geojson/CNTR_RG_01M_2024_3035.geojson
```
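If the notebook should fetch the file itself rather than shell out to `wget`, a small standard-library alternative works as well; this is a sketch (the `url` and `path` variables are illustrative, the URL is the one from the comment above):

```python
import os
import urllib.request

url = ('https://gisco-services.ec.europa.eu/distribution/v2/countries/'
       'geojson/CNTR_RG_01M_2024_3035.geojson')
path = 'CNTR_RG_01M_2024_3035.geojson'

# download the countries GeoJSON only if it is not already present
if not os.path.exists(path):
    urllib.request.urlretrieve(url, path)
```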
Assign regions to countries
```python
countries = gpd.read_file('CNTR_RG_01M_2024_3035.geojson').to_crs(epsg=3035)
country_polygons = countries[countries['NAME_ENGL'].isin(country_names)]

region_idxs, country_idxs = country_polygons.sindex.query(region_hulls.geometry, predicate='intersects')
intersections = region_hulls.iloc[region_idxs].intersection(country_polygons.iloc[country_idxs], align=False).area
```
```python
country_polygons
```

| | CNTR_ID | CNTR_NAME | NAME_ENGL | NAME_FREN | ISO3_CODE | SVRG_UN | CAPT | EU_STAT | EFTA_STAT | CC_STAT | NAME_GERM | geometry |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 11 | AT | Österreich | Austria | Autriche | AUT | UN Member State | Vienna | T | F | F | Österreich | MULTIPOLYGON (((4354847.685 2714710.627, 43552... |
| 55 | CZ | Česká Republika | Czechia | Tchéquie | CZE | UN Member State | Prague | T | F | F | Tschechien | MULTIPOLYGON (((4624842.426 3112217.365, 46255... |
| 56 | DE | Deutschland | Germany | Allemagne | DEU | UN Member State | Berlin | T | F | F | Deutschland | MULTIPOLYGON (((4355225.354 2715902.995, 43548... |
| 169 | PL | Polska | Poland | Pologne | POL | UN Member State | Warsaw | T | F | F | Polen | MULTIPOLYGON (((4852825.195 3556096.333, 48551... |
| 191 | SK | Slovensko | Slovakia | Slovaquie | SVK | UN Member State | Bratislava | T | F | F | Slowakei | MULTIPOLYGON (((5003133.924 2988592.038, 50037... |
```python
intersection_df = gpd.GeoDataFrame(
    {'region_id': region_hulls.index[region_idxs].values,
     'country': country_polygons.iloc[country_idxs, 2].values,  # column 2 is NAME_ENGL
     'intersection_area': intersections.values,
     'geometry': region_hulls.iloc[region_idxs, 0].values
    },
    crs=region_hulls.crs
)

# a region hull can intersect several countries; keep, for each region,
# only the country with the largest overlap
intersection_df = intersection_df.sort_values('intersection_area', ascending=False)
intersection_df = intersection_df[~intersection_df.region_id.duplicated()].sort_values('region_id')
assert (intersection_df.region_id == region_hulls.index).all()
```
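The largest-overlap rule above can be seen on a self-contained toy case (synthetic boxes, not the real data; `regions`, `countries_toy`, and `pairs` are illustrative names):

```python
import geopandas as gpd
import pandas as pd
from shapely.geometry import box

# one region straddling two countries: 70% of it in A, 30% in B
regions = gpd.GeoDataFrame({'region_id': [0]}, geometry=[box(0, 0, 10, 10)])
countries_toy = gpd.GeoDataFrame(
    {'name': ['A', 'B']},
    geometry=[box(0, 0, 7, 10), box(7, 0, 20, 10)],
)

r_idx, c_idx = countries_toy.sindex.query(regions.geometry, predicate='intersects')
areas = regions.geometry.iloc[r_idx].intersection(
    countries_toy.geometry.iloc[c_idx], align=False
).area

pairs = pd.DataFrame({
    'region_id': regions['region_id'].iloc[r_idx].values,
    'country': countries_toy['name'].iloc[c_idx].values,
    'intersection_area': areas.values,
})
pairs = pairs.sort_values('intersection_area', ascending=False)
print(pairs[~pairs.region_id.duplicated()])  # region 0 -> 'A' (larger overlap)
```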
```python
def combine_regions_data(selected_regions, char_type='primary'):
    """Combine the morphometric data, target labels and hexagons from all of the 'selected_regions' into a single dataframe for model training."""
    all_data = []
    all_labels = []
    all_hexagons = []

    for trid in selected_regions:
        if char_type == 'weighted_lag':
            data = pd.read_parquet(f'/data/uscuni-eurofab/processed_data/chars/lag_chars_{trid}_{kernel}_{spatial_lag}_{bandwith_type}.parquet')
            data = data.astype(np.float32)
            data.index = data.index.astype(np.int32)
        elif char_type == 'unweighted_lag':
            data = pd.read_parquet(f'/data/uscuni-eurofab/processed_data/chars/lag_chars_{trid}_unweighted_{spatial_lag}.parquet')
            data = data.astype(np.float32)
            data.index = data.index.astype(np.int32)
        elif char_type == 'primary':
            data = pd.read_parquet(f'{data_dir}primary_chars_{trid}.parquet')
        elif char_type == 'only_unweighted_lag':
            data = pd.read_parquet(f'/data/uscuni-eurofab/processed_data/chars/lag_chars_{trid}_unweighted_{spatial_lag}.parquet')
            data = data.astype(np.float32)
            data.index = data.index.astype(np.int32)
            iqr = pd.DataFrame(data.loc[:, data.columns.str.contains('_85')].values - data.loc[:, data.columns.str.contains('_15')].values,
                               index=data.index,
                               columns=data.columns[~data.columns.str.contains('_')] + '_iqr')
            median = data.loc[:, data.columns.str.contains('_median')]
            data = pd.merge(iqr, median, left_index=True, right_index=True)

        elif char_type == 'primary_overture':
            # same as primary data, but add a data source column from the building parquet
            data = pd.read_parquet(f'{data_dir}primary_chars_{trid}.parquet')
            data_source = pd.read_parquet(f"{data_dir}buildings_chars_{trid}.parquet", columns=['source', 'class'])
            data['source'] = data_source['source']
        elif char_type == 'unweighted_lag_overture':
            # same as unweighted lag, but add a data source column from the building parquet and only keep the median
            data = pd.read_parquet(f'/data/uscuni-eurofab-overture/processed_data/chars/lag_chars_{trid}_unweighted_{spatial_lag}.parquet')
            data = data.astype(np.float32)
            data.index = data.index.astype(np.int32)
            data_source = pd.read_parquet(f"{data_dir}buildings_chars_{trid}.parquet", columns=['source', 'class'])
            data['source'] = data_source['source']
            data = data.loc[:, ~data.columns.str.contains('_') | data.columns.str.contains('_median')]
        elif char_type == 'only_unweighted_lag_overture':
            # keep only median and iqr from the lag, no primary chars
            data = pd.read_parquet(f'/data/uscuni-eurofab-overture/processed_data/chars/lag_chars_{trid}_unweighted_{spatial_lag}.parquet')
            data = data.astype(np.float32)
            data.index = data.index.astype(np.int32)
            iqr = pd.DataFrame(data.loc[:, data.columns.str.contains('_85')].values - data.loc[:, data.columns.str.contains('_15')].values,
                               index=data.index,
                               columns=data.columns[~data.columns.str.contains('_')] + '_iqr')
            median = data.loc[:, data.columns.str.contains('_median')]
            data = pd.merge(iqr, median, left_index=True, right_index=True)

        else:
            raise Exception('Datatype does not exist')

        targets = pd.read_parquet(f'{target_dir}{trid}_target.pq').set_index('index')
        hexagons = pd.read_parquet(f'{hex_dir}{trid}_hexagon.pq').set_index('index')

        # drop empty tess to save some memory
        targets = targets[targets.index > -1]

        common_index = data.index.join(targets.index, how='inner').join(hexagons.index, how='inner')
        data = data.loc[common_index]
        targets = targets.loc[common_index]
        hexagons = hexagons.loc[common_index]

        # record region_id in the index
        common_index = str(trid) + '_' + common_index.astype(str)

        data = data.set_index(common_index)
        targets = targets.set_index(common_index)
        hexagons = hexagons.set_index(common_index)

        all_data.append(data)
        all_labels.append(targets)
        all_hexagons.append(hexagons)

    all_data = pd.concat(all_data)
    all_labels = pd.concat(all_labels)
    all_hexagons = pd.concat(all_hexagons)

    return all_data, all_labels, all_hexagons
```
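The two `only_unweighted_lag*` branches compress each lagged character into a robust spread plus a centre: the inter-quantile range (85th-percentile column minus 15th-percentile column) alongside the median. A small sketch of that transformation, assuming the column convention implied by the code above (`<char>` unlagged, `<char>_15`, `<char>_median`, `<char>_85`; the character name `sdbAre` here is only an example):

```python
import numpy as np
import pandas as pd

# toy lag table following the assumed naming convention
data = pd.DataFrame({
    'sdbAre':        [1.0, 2.0],   # unlagged value, no underscore
    'sdbAre_15':     [0.5, 1.0],   # 15th percentile within the lag
    'sdbAre_median': [1.1, 2.2],
    'sdbAre_85':     [2.5, 4.0],   # 85th percentile within the lag
})

iqr = pd.DataFrame(
    data.loc[:, data.columns.str.contains('_85')].values
    - data.loc[:, data.columns.str.contains('_15')].values,
    index=data.index,
    columns=data.columns[~data.columns.str.contains('_')] + '_iqr',
)
median = data.loc[:, data.columns.str.contains('_median')]
print(pd.merge(iqr, median, left_index=True, right_index=True))
# -> sdbAre_iqr = [2.0, 3.0] alongside sdbAre_median
```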
Generate the final country-level train/test datasets.
```python
%%time
training_countries = TRAINING1
testing_countries = TESTING1
i = 1

training_regions = intersection_df[intersection_df['country'].isin(training_countries)].region_id.values
testing_regions = intersection_df[intersection_df['country'].isin(testing_countries)].region_id.values

all_data, all_labels, all_hexagons = combine_regions_data(training_regions, char_type=char_type)

all_data.to_parquet(f'{coredir}processed_data/train_test_data/training_data1.pq')
all_labels.to_parquet(f'{coredir}processed_data/train_test_data/training_labels1.pq')
all_hexagons.to_parquet(f'{coredir}processed_data/train_test_data/training_hexagons1.pq')

all_data, all_labels, all_hexagons = combine_regions_data(testing_regions, char_type=char_type)

all_data.to_parquet(f'{coredir}processed_data/train_test_data/testing_data1.pq')
all_labels.to_parquet(f'{coredir}processed_data/train_test_data/testing_labels1.pq')
```
CPU times: user 8min 10s, sys: 4min 31s, total: 12min 42s
Wall time: 3min 49s
```python
%%time
training_countries = TRAINING2
testing_countries = TESTING2
i = 2

training_regions = intersection_df[intersection_df['country'].isin(training_countries)].region_id.values
testing_regions = intersection_df[intersection_df['country'].isin(testing_countries)].region_id.values

all_data, all_labels, all_hexagons = combine_regions_data(training_regions, char_type=char_type)

all_data.to_parquet(f'{coredir}processed_data/train_test_data/training_data{i}.pq')
all_labels.to_parquet(f'{coredir}processed_data/train_test_data/training_labels{i}.pq')
all_hexagons.to_parquet(f'{coredir}processed_data/train_test_data/training_hexagons{i}.pq')

all_data, all_labels, all_hexagons = combine_regions_data(testing_regions, char_type=char_type)

all_data.to_parquet(f'{coredir}processed_data/train_test_data/testing_data{i}.pq')
all_labels.to_parquet(f'{coredir}processed_data/train_test_data/testing_labels{i}.pq')
```
CPU times: user 7min 57s, sys: 4min 35s, total: 12min 32s
Wall time: 3min 49s
```python
%%time
training_countries = TRAINING3
testing_countries = TESTING3
i = 3

training_regions = intersection_df[intersection_df['country'].isin(training_countries)].region_id.values
testing_regions = intersection_df[intersection_df['country'].isin(testing_countries)].region_id.values

all_data, all_labels, all_hexagons = combine_regions_data(training_regions, char_type=char_type)

all_data.to_parquet(f'{coredir}processed_data/train_test_data/training_data{i}.pq')
all_labels.to_parquet(f'{coredir}processed_data/train_test_data/training_labels{i}.pq')
all_hexagons.to_parquet(f'{coredir}processed_data/train_test_data/training_hexagons{i}.pq')

all_data, all_labels, all_hexagons = combine_regions_data(testing_regions, char_type=char_type)

all_data.to_parquet(f'{coredir}processed_data/train_test_data/testing_data{i}.pq')
all_labels.to_parquet(f'{coredir}processed_data/train_test_data/testing_labels{i}.pq')
```
CPU times: user 7min 55s, sys: 4min 3s, total: 11min 58s
Wall time: 3min 34s
```python
%%time
training_countries = TRAINING4
testing_countries = TESTING4
i = 4

training_regions = intersection_df[intersection_df['country'].isin(training_countries)].region_id.values
testing_regions = intersection_df[intersection_df['country'].isin(testing_countries)].region_id.values

all_data, all_labels, all_hexagons = combine_regions_data(training_regions, char_type=char_type)

all_data.to_parquet(f'{coredir}processed_data/train_test_data/training_data{i}.pq')
all_labels.to_parquet(f'{coredir}processed_data/train_test_data/training_labels{i}.pq')
all_hexagons.to_parquet(f'{coredir}processed_data/train_test_data/training_hexagons{i}.pq')

all_data, all_labels, all_hexagons = combine_regions_data(testing_regions, char_type=char_type)

all_data.to_parquet(f'{coredir}processed_data/train_test_data/testing_data{i}.pq')
all_labels.to_parquet(f'{coredir}processed_data/train_test_data/testing_labels{i}.pq')
```
CPU times: user 8min 2s, sys: 4min 21s, total: 12min 23s
Wall time: 3min 57s
```python
%%time
training_countries = TRAINING5
testing_countries = TESTING5
i = 5

training_regions = intersection_df[intersection_df['country'].isin(training_countries)].region_id.values
testing_regions = intersection_df[intersection_df['country'].isin(testing_countries)].region_id.values

all_data, all_labels, all_hexagons = combine_regions_data(training_regions, char_type=char_type)

all_data.to_parquet(f'{coredir}processed_data/train_test_data/training_data{i}.pq')
all_labels.to_parquet(f'{coredir}processed_data/train_test_data/training_labels{i}.pq')
all_hexagons.to_parquet(f'{coredir}processed_data/train_test_data/training_hexagons{i}.pq')

all_data, all_labels, all_hexagons = combine_regions_data(testing_regions, char_type=char_type)

all_data.to_parquet(f'{coredir}processed_data/train_test_data/testing_data{i}.pq')
all_labels.to_parquet(f'{coredir}processed_data/train_test_data/testing_labels{i}.pq')
```
CPU times: user 8min 4s, sys: 4min 19s, total: 12min 23s
Wall time: 3min 51s
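The five cells above differ only in the fold index, presumably kept separate so each fold can be timed and re-run independently. For reference, the same work could be expressed as one loop over the constants defined earlier; this is a sketch, not what was executed:

```python
# equivalent loop over the five leave-one-country-out folds (sketch)
folds = [(TRAINING1, TESTING1), (TRAINING2, TESTING2), (TRAINING3, TESTING3),
         (TRAINING4, TESTING4), (TRAINING5, TESTING5)]

for i, (training_countries, testing_countries) in enumerate(folds, start=1):
    training_regions = intersection_df[intersection_df['country'].isin(training_countries)].region_id.values
    testing_regions = intersection_df[intersection_df['country'].isin(testing_countries)].region_id.values

    for split, regions in [('training', training_regions), ('testing', testing_regions)]:
        all_data, all_labels, all_hexagons = combine_regions_data(regions, char_type=char_type)
        all_data.to_parquet(f'{coredir}processed_data/train_test_data/{split}_data{i}.pq')
        all_labels.to_parquet(f'{coredir}processed_data/train_test_data/{split}_labels{i}.pq')
        if split == 'training':
            # as in the cells above, hexagons are only saved for the training side
            all_hexagons.to_parquet(f'{coredir}processed_data/train_test_data/{split}_hexagons{i}.pq')
```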
```python
%%time
# 6 is the insample model without spatial grouping (hexagons)
all_data, all_labels, all_hexagons = combine_regions_data(region_hulls.index.values, char_type=char_type)

from sklearn.model_selection import StratifiedKFold

gkf = StratifiedKFold(n_splits=5, shuffle=True, random_state=123)
splits = gkf.split(
    all_data,
    y=all_labels.final_without_noise,
)
split_label = np.empty(len(all_data), dtype=float)
for i, (train, test) in enumerate(splits):
    split_label[test] = i

i = 6

# assign four of the groups as training data
train = split_label != 0
all_data.loc[train].to_parquet(f'{coredir}processed_data/train_test_data/training_data{i}.pq')
all_labels.loc[train].to_parquet(f'{coredir}processed_data/train_test_data/training_labels{i}.pq')
all_hexagons.loc[train].to_parquet(f'{coredir}processed_data/train_test_data/training_hexagons{i}.pq')

# assign one of the groups as test data
test = split_label == 0
all_data.loc[test].to_parquet(f'{coredir}processed_data/train_test_data/testing_data{i}.pq')
all_labels.loc[test].to_parquet(f'{coredir}processed_data/train_test_data/testing_labels{i}.pq')
```
CPU times: user 11min 53s, sys: 7min 9s, total: 19min 3s
Wall time: 6min 55s
```python
%%time
# 7 is the insample model with spatial grouping (hexagons)
all_data, all_labels, all_hexagons = combine_regions_data(region_hulls.index.values, char_type=char_type)

from sklearn.model_selection import StratifiedGroupKFold

gkf = StratifiedGroupKFold(n_splits=5, shuffle=True, random_state=123)
splits = gkf.split(
    all_data,
    y=all_labels.final_without_noise,
    groups=all_hexagons.hexagons,
)
split_label = np.empty(len(all_data), dtype=float)
for i, (train, test) in enumerate(splits):
    split_label[test] = i

i = 7

# assign four of the groups as training data
train = split_label != 0
all_data.loc[train].to_parquet(f'{coredir}processed_data/train_test_data/training_data{i}.pq')
all_labels.loc[train].to_parquet(f'{coredir}processed_data/train_test_data/training_labels{i}.pq')
all_hexagons.loc[train].to_parquet(f'{coredir}processed_data/train_test_data/training_hexagons{i}.pq')

# assign one of the groups as test data
test = split_label == 0
all_data.loc[test].to_parquet(f'{coredir}processed_data/train_test_data/testing_data{i}.pq')
all_labels.loc[test].to_parquet(f'{coredir}processed_data/train_test_data/testing_labels{i}.pq')
```
CPU times: user 13min 19s, sys: 6min 31s, total: 19min 51s
Wall time: 7min 56s
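The difference between splits 6 and 7 is the `groups` argument: `StratifiedGroupKFold` never places rows from the same hexagon on both sides of a split, which prevents spatial leakage between neighbouring observations. A minimal synthetic demonstration of that guarantee (toy data, not the pipeline's):

```python
import numpy as np
from sklearn.model_selection import StratifiedGroupKFold

rng = np.random.default_rng(0)
X = rng.normal(size=(100, 2))
y = rng.integers(0, 2, size=100)       # class labels to stratify on
groups = np.repeat(np.arange(20), 5)   # 20 "hexagons", 5 rows each

gkf = StratifiedGroupKFold(n_splits=5, shuffle=True, random_state=123)
for train, test in gkf.split(X, y=y, groups=groups):
    # no group (hexagon) appears in both train and test
    assert set(groups[train]).isdisjoint(groups[test])
```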
Plot testing regions
```python
training_countries = TRAINING4
testing_countries = TESTING4
i = 4

training_regions = intersection_df[intersection_df['country'].isin(training_countries)].region_id.values
testing_regions = intersection_df[intersection_df['country'].isin(testing_countries)].region_id.values

region_hulls.loc[testing_regions].explore()
```