import geopandas as gpd
import pandas as pd
import numpy as np

# Train test split
# Split the data into multiple train / test sets, so that the prediction
# accuracy of the model can be assessed against every country.
# The five countries used for leave-one-country-out cross validation.
country_names = ['Germany', 'Poland', 'Czechia', 'Slovakia', 'Austria']

# Five train/test splits: each split trains on four countries and holds out
# exactly one country for testing.
TRAINING1, TESTING1 = ['Poland', 'Germany', 'Austria', 'Czechia'], ['Slovakia']
TRAINING2, TESTING2 = ['Slovakia', 'Germany', 'Austria', 'Czechia'], ['Poland']
TRAINING3, TESTING3 = ['Poland', 'Slovakia', 'Austria', 'Czechia'], ['Germany']
TRAINING4, TESTING4 = ['Poland', 'Germany', 'Slovakia', 'Czechia'], ['Austria']
TRAINING5, TESTING5 = ['Poland', 'Georgia' if False else 'Germany', 'Austria', 'Slovakia'], ['Czechia'] if True else []

# Root of the Overture-based data tree; region hulls live under the same root.
coredir = '/data/uscuni-eurofab-overture/'
regions_datadir = coredir
# Locations of the processed pipeline outputs, all under coredir.
tessellations_dir = coredir + 'processed_data/tessellations/'
buildings_dir = coredir + 'processed_data/buildings/'
data_dir = coredir + 'processed_data/chars/'
target_dir = coredir + 'processed_data/target_clusters/'
hex_dir = coredir + 'processed_data/hexagons/'
# Load the region hulls matching the active data source: the original
# eurofab tree ships MS-based hulls, the Overture tree ships OV-based hulls.
if coredir == '/data/uscuni-eurofab/':
    hulls_filename = "ms_ce_region_hulls.parquet"
else:
    hulls_filename = "ov_ce_region_hulls.parquet"
region_hulls = gpd.read_parquet(regions_datadir + "regions/" + hulls_filename)
# Spatial-lag feature parameters; they are interpolated into the lag-chars
# parquet filenames, so changing them selects different precomputed files.
kernel = 'gaussian'
spatial_lag = 3
# NOTE(review): 'bandwith' (sic) is kept because the on-disk filenames use
# this spelling -- renaming would break the file lookups.
bandwith_type = -1

# Which feature set combine_regions_data() should load.
char_type = 'only_unweighted_lag_overture'

# Download a shapefile of all EU countries, if not present.
# download countries
# !wget https://gisco-services.ec.europa.eu/distribution/v2/countries/geojson/CNTR_RG_01M_2024_3035.geojson

# Assign regions to countries
# Read the GISCO country polygons and keep only the five study countries.
countries = gpd.read_file('CNTR_RG_01M_2024_3035.geojson').to_crs(epsg=3035)
country_polygons = countries[countries['NAME_ENGL'].isin(country_names)]

# Pairs of (region, country) positional indices for every hull/country
# intersection, then the area of each intersection; align=False keeps the
# row-wise pairing from the spatial-index query.
region_idxs, country_idxs = country_polygons.sindex.query(region_hulls.geometry, predicate='intersects')
intersections = region_hulls.iloc[region_idxs].intersection(country_polygons.iloc[country_idxs], align=False).area

country_polygons
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 11 | AT | Österreich | Austria | Autriche | AUT | UN Member State | Vienna | T | F | F | Österreich | MULTIPOLYGON (((4354847.685 2714710.627, 43552... |
| 55 | CZ | Česká Republika | Czechia | Tchéquie | CZE | UN Member State | Prague | T | F | F | Tschechien | MULTIPOLYGON (((4624842.426 3112217.365, 46255... |
| 56 | DE | Deutschland | Germany | Allemagne | DEU | UN Member State | Berlin | T | F | F | Deutschland | MULTIPOLYGON (((4355225.354 2715902.995, 43548... |
| 169 | PL | Polska | Poland | Pologne | POL | UN Member State | Warsaw | T | F | F | Polen | MULTIPOLYGON (((4852825.195 3556096.333, 48551... |
| 191 | SK | Slovensko | Slovakia | Slovaquie | SVK | UN Member State | Bratislava | T | F | F | Slowakei | MULTIPOLYGON (((5003133.924 2988592.038, 50037... |
# One row per (region, country) intersection pair.
intersection_df = gpd.GeoDataFrame(
    {
        'region_id': region_hulls.index[region_idxs].values,
        # select by column name rather than positional index 2, which
        # silently breaks if the GISCO schema ever changes
        'country': country_polygons['NAME_ENGL'].iloc[country_idxs].values,
        'intersection_area': intersections.values,
        # use the active geometry column by name rather than position 0
        'geometry': region_hulls.geometry.iloc[region_idxs].values
    },
    crs=region_hulls.crs
)
# Keep, for every region, only the country with the largest overlap, then
# restore region order.
intersection_df = intersection_df.sort_values('intersection_area', ascending=False)
intersection_df = intersection_df[~intersection_df.region_id.duplicated()].sort_values('region_id')
# Sanity check: after deduplication every region hull was assigned exactly
# one country, and the rows line up with region_hulls' index order.
assert (intersection_df.region_id == region_hulls.index).all()


def _read_lag_chars(path):
    """Read a lag-characteristics parquet and normalise dtypes.

    Values are down-cast to float32 and the index to int32 to keep the
    combined training frames small in memory.
    """
    data = pd.read_parquet(path)
    data = data.astype(np.float32)
    data.index = data.index.astype(np.int32)
    return data


def _lag_iqr_median(data):
    """Reduce a lag-characteristics frame to per-character IQR and median columns.

    The IQR is the '_85' percentile column minus the matching '_15' column.
    NOTE(review): assumes the '_85' and '_15' column blocks appear in the
    same character order, and that the underscore-free columns correspond to
    them one-to-one -- verify against the lag-chars writer.
    """
    iqr = pd.DataFrame(
        data.loc[:, data.columns.str.contains('_85')].values
        - data.loc[:, data.columns.str.contains('_15')].values,
        index=data.index,
        columns=data.columns[~data.columns.str.contains('_')] + '_iqr',
    )
    median = data.loc[:, data.columns.str.contains('_median')]
    return pd.merge(iqr, median, left_index=True, right_index=True)


def combine_regions_data(selected_regions, char_type='primary'):
    """Combine the morphometric data, target labels and hexagons from all of the 'selected_regions' into a single dataframe for model training.

    Parameters
    ----------
    selected_regions : iterable
        Region ids whose per-region parquet files are loaded and concatenated.
    char_type : str, default 'primary'
        Which feature set to load: 'weighted_lag', 'unweighted_lag',
        'primary', 'only_unweighted_lag', 'primary_overture',
        'unweighted_lag_overture' or 'only_unweighted_lag_overture'.

    Returns
    -------
    tuple of (all_data, all_labels, all_hexagons)
        Three dataframes aligned on a '<region_id>_<element_id>' string index.

    Raises
    ------
    Exception
        If 'char_type' is not one of the recognised values.
    """
    all_data = []
    all_labels = []
    all_hexagons = []
    for trid in selected_regions:
        # NOTE(review): the non-overture branches read from the fixed
        # '/data/uscuni-eurofab/' tree rather than coredir -- presumably
        # intentional (legacy MS-buildings features); confirm.
        if char_type == 'weighted_lag':
            data = _read_lag_chars(f'/data/uscuni-eurofab/processed_data/chars/lag_chars_{trid}_{kernel}_{spatial_lag}_{bandwith_type}.parquet')
        elif char_type == 'unweighted_lag':
            data = _read_lag_chars(f'/data/uscuni-eurofab/processed_data/chars/lag_chars_{trid}_unweighted_{spatial_lag}.parquet')
        elif char_type == 'primary':
            data = pd.read_parquet(f'{data_dir}primary_chars_{trid}.parquet')
        elif char_type == 'only_unweighted_lag':
            # keep only median and iqr from the lag, no primary chars
            data = _read_lag_chars(f'/data/uscuni-eurofab/processed_data/chars/lag_chars_{trid}_unweighted_{spatial_lag}.parquet')
            data = _lag_iqr_median(data)
        elif char_type == 'primary_overture':
            # same as primary data, but add a data source column from the building parquet
            data = pd.read_parquet(f'{data_dir}primary_chars_{trid}.parquet')
            data_source = pd.read_parquet(f"{data_dir}buildings_chars_{trid}.parquet", columns=['source', 'class'])
            data['source'] = data_source['source']
        elif char_type == 'unweighted_lag_overture':
            # same as unweighted lag, but add a data source column from the building parquet and only save the median
            data = _read_lag_chars(f'/data/uscuni-eurofab-overture/processed_data/chars/lag_chars_{trid}_unweighted_{spatial_lag}.parquet')
            data_source = pd.read_parquet(f"{data_dir}buildings_chars_{trid}.parquet", columns=['source', 'class'])
            data['source'] = data_source['source']
            # keep underscore-free columns (incl. 'source') plus the medians
            data = data.loc[:, ~data.columns.str.contains('_') | data.columns.str.contains('_median')]
        elif char_type == 'only_unweighted_lag_overture':
            # keep only median and iqr from the lag, no primary chars
            data = _read_lag_chars(f'/data/uscuni-eurofab-overture/processed_data/chars/lag_chars_{trid}_unweighted_{spatial_lag}.parquet')
            data = _lag_iqr_median(data)
        else:
            raise Exception('Datatype does not exist')
        targets = pd.read_parquet(f'{target_dir}{trid}_target.pq').set_index('index')
        hexagons = pd.read_parquet(f'{hex_dir}{trid}_hexagon.pq').set_index('index')
        # drop empty tess to save some memory
        targets = targets[targets.index > -1]
        # keep only the element ids present in all three frames
        common_index = data.index.join(targets.index, how='inner').join(hexagons.index, how='inner')
        data = data.loc[common_index]
        targets = targets.loc[common_index]
        hexagons = hexagons.loc[common_index]
        # record region_id in the index
        common_index = str(trid) + '_' + common_index.astype(str)
        data = data.set_index(common_index)
        targets = targets.set_index(common_index)
        hexagons = hexagons.set_index(common_index)
        all_data.append(data)
        all_labels.append(targets)
        all_hexagons.append(hexagons)
    all_data = pd.concat(all_data)
    all_labels = pd.concat(all_labels)
    all_hexagons = pd.concat(all_hexagons)
    return all_data, all_labels, all_hexagons

# Generate the final country level train/test datasets.
# %%time  (IPython cell magic in the original notebook)
# Split 1: train on Poland/Germany/Austria/Czechia, test on Slovakia.
training_countries = TRAINING1
testing_countries = TESTING1
i = 1
training_regions = intersection_df[intersection_df['country'].isin(training_countries)].region_id.values
testing_regions = intersection_df[intersection_df['country'].isin(testing_countries)].region_id.values
all_data, all_labels, all_hexagons = combine_regions_data(training_regions, char_type=char_type)
# use {i} like the other split cells instead of a hard-coded '1' suffix
all_data.to_parquet(f'{coredir}processed_data/train_test_data/training_data{i}.pq')
all_labels.to_parquet(f'{coredir}processed_data/train_test_data/training_labels{i}.pq')
all_hexagons.to_parquet(f'{coredir}processed_data/train_test_data/training_hexagons{i}.pq')
all_data, all_labels, all_hexagons = combine_regions_data(testing_regions, char_type=char_type)
all_data.to_parquet(f'{coredir}processed_data/train_test_data/testing_data{i}.pq')
all_labels.to_parquet(f'{coredir}processed_data/train_test_data/testing_labels{i}.pq')
# CPU times: user 8min 10s, sys: 4min 31s, total: 12min 42s
# Wall time: 3min 49s
# %%time  (IPython cell magic in the original notebook)
# Split 2: hold out Poland.
training_countries = TRAINING2
testing_countries = TESTING2
i = 2
training_regions = intersection_df[intersection_df['country'].isin(training_countries)].region_id.values
testing_regions = intersection_df[intersection_df['country'].isin(testing_countries)].region_id.values
all_data, all_labels, all_hexagons = combine_regions_data(training_regions, char_type=char_type)
all_data.to_parquet(f'{coredir}processed_data/train_test_data/training_data{i}.pq')
all_labels.to_parquet(f'{coredir}processed_data/train_test_data/training_labels{i}.pq')
all_hexagons.to_parquet(f'{coredir}processed_data/train_test_data/training_hexagons{i}.pq')
all_data, all_labels, all_hexagons = combine_regions_data(testing_regions, char_type=char_type)
all_data.to_parquet(f'{coredir}processed_data/train_test_data/testing_data{i}.pq')
all_labels.to_parquet(f'{coredir}processed_data/train_test_data/testing_labels{i}.pq')
# CPU times: user 7min 57s, sys: 4min 35s, total: 12min 32s
# Wall time: 3min 49s
# %%time  (IPython cell magic in the original notebook)
# Split 3: hold out Germany.
training_countries = TRAINING3
testing_countries = TESTING3
i = 3
training_regions = intersection_df[intersection_df['country'].isin(training_countries)].region_id.values
testing_regions = intersection_df[intersection_df['country'].isin(testing_countries)].region_id.values
all_data, all_labels, all_hexagons = combine_regions_data(training_regions, char_type=char_type)
all_data.to_parquet(f'{coredir}processed_data/train_test_data/training_data{i}.pq')
all_labels.to_parquet(f'{coredir}processed_data/train_test_data/training_labels{i}.pq')
all_hexagons.to_parquet(f'{coredir}processed_data/train_test_data/training_hexagons{i}.pq')
all_data, all_labels, all_hexagons = combine_regions_data(testing_regions, char_type=char_type)
all_data.to_parquet(f'{coredir}processed_data/train_test_data/testing_data{i}.pq')
all_labels.to_parquet(f'{coredir}processed_data/train_test_data/testing_labels{i}.pq')
# CPU times: user 7min 55s, sys: 4min 3s, total: 11min 58s
# Wall time: 3min 34s
# %%time  (IPython cell magic in the original notebook)
# Split 4: hold out Austria.
training_countries = TRAINING4
testing_countries = TESTING4
i = 4
training_regions = intersection_df[intersection_df['country'].isin(training_countries)].region_id.values
testing_regions = intersection_df[intersection_df['country'].isin(testing_countries)].region_id.values
all_data, all_labels, all_hexagons = combine_regions_data(training_regions, char_type=char_type)
all_data.to_parquet(f'{coredir}processed_data/train_test_data/training_data{i}.pq')
all_labels.to_parquet(f'{coredir}processed_data/train_test_data/training_labels{i}.pq')
all_hexagons.to_parquet(f'{coredir}processed_data/train_test_data/training_hexagons{i}.pq')
all_data, all_labels, all_hexagons = combine_regions_data(testing_regions, char_type=char_type)
all_data.to_parquet(f'{coredir}processed_data/train_test_data/testing_data{i}.pq')
all_labels.to_parquet(f'{coredir}processed_data/train_test_data/testing_labels{i}.pq')
# CPU times: user 8min 2s, sys: 4min 21s, total: 12min 23s
# Wall time: 3min 57s
# %%time  (IPython cell magic in the original notebook)
# Split 5: hold out Czechia.
training_countries = TRAINING5
testing_countries = TESTING5
i = 5
training_regions = intersection_df[intersection_df['country'].isin(training_countries)].region_id.values
testing_regions = intersection_df[intersection_df['country'].isin(testing_countries)].region_id.values
all_data, all_labels, all_hexagons = combine_regions_data(training_regions, char_type=char_type)
all_data.to_parquet(f'{coredir}processed_data/train_test_data/training_data{i}.pq')
all_labels.to_parquet(f'{coredir}processed_data/train_test_data/training_labels{i}.pq')
all_hexagons.to_parquet(f'{coredir}processed_data/train_test_data/training_hexagons{i}.pq')
all_data, all_labels, all_hexagons = combine_regions_data(testing_regions, char_type=char_type)
all_data.to_parquet(f'{coredir}processed_data/train_test_data/testing_data{i}.pq')
all_labels.to_parquet(f'{coredir}processed_data/train_test_data/testing_labels{i}.pq')
# CPU times: user 8min 4s, sys: 4min 19s, total: 12min 23s
# Wall time: 3min 51s
# %%time  (IPython cell magic in the original notebook)
# 6 is the insample model without spatial grouping (hexagons)
all_data, all_labels, all_hexagons = combine_regions_data(region_hulls.index.values, char_type=char_type)
from sklearn.model_selection import StratifiedKFold

# Stratify the 5 folds on the target label so all classes appear in each fold.
gkf = StratifiedKFold(n_splits=5, shuffle=True, random_state=123)
splits = gkf.split(
    all_data,
    y=all_labels.final_without_noise,
)
# fold id per row; the 5-fold partition assigns every row exactly once
split_label = np.empty(len(all_data), dtype=float)
for i, (train, test) in enumerate(splits):
    split_label[test] = i
i = 6  # reassign i after the loop: 6 is this model's output suffix
# assign four of the groups as training data
train = split_label != 0
all_data.loc[train].to_parquet(f'{coredir}processed_data/train_test_data/training_data{i}.pq')
all_labels.loc[train].to_parquet(f'{coredir}processed_data/train_test_data/training_labels{i}.pq')
all_hexagons.loc[train].to_parquet(f'{coredir}processed_data/train_test_data/training_hexagons{i}.pq')
# assign one of the groups as test data
test = split_label == 0
all_data.loc[test].to_parquet(f'{coredir}processed_data/train_test_data/testing_data{i}.pq')
all_labels.loc[test].to_parquet(f'{coredir}processed_data/train_test_data/testing_labels{i}.pq')
# CPU times: user 11min 53s, sys: 7min 9s, total: 19min 3s
# Wall time: 6min 55s
# %%time  (IPython cell magic in the original notebook)
# 7 is the insample model with spatial grouping (hexagons)
all_data, all_labels, all_hexagons = combine_regions_data(region_hulls.index.values, char_type=char_type)
from sklearn.model_selection import StratifiedGroupKFold

# Stratify on the target label while keeping each hexagon's rows in a single
# fold, so spatially co-located samples never straddle train and test.
gkf = StratifiedGroupKFold(n_splits=5, shuffle=True, random_state=123)
splits = gkf.split(
    all_data,
    y=all_labels.final_without_noise,
    groups=all_hexagons.hexagons,
)
# fold id per row; the 5-fold partition assigns every row exactly once
split_label = np.empty(len(all_data), dtype=float)
for i, (train, test) in enumerate(splits):
    split_label[test] = i
i = 7  # reassign i after the loop: 7 is this model's output suffix
# assign four of the groups as training data
train = split_label != 0
all_data.loc[train].to_parquet(f'{coredir}processed_data/train_test_data/training_data{i}.pq')
all_labels.loc[train].to_parquet(f'{coredir}processed_data/train_test_data/training_labels{i}.pq')
all_hexagons.loc[train].to_parquet(f'{coredir}processed_data/train_test_data/training_hexagons{i}.pq')
# assign one of the groups as test data
test = split_label == 0
all_data.loc[test].to_parquet(f'{coredir}processed_data/train_test_data/testing_data{i}.pq')
all_labels.loc[test].to_parquet(f'{coredir}processed_data/train_test_data/testing_labels{i}.pq')
# CPU times: user 13min 19s, sys: 6min 31s, total: 19min 51s
# Wall time: 7min 56s
# Plot testing regions
# Visual sanity check: render the hulls held out by split 4 (Austria).
training_countries = TRAINING4
testing_countries = TESTING4
i = 4
training_regions = intersection_df[intersection_df['country'].isin(training_countries)].region_id.values
testing_regions = intersection_df[intersection_df['country'].isin(testing_countries)].region_id.values
region_hulls.loc[testing_regions].explore()
# (notebook output: "Make this Notebook Trusted to load map: File -> Trust Notebook")