import geopandas as gpd
import pandas as pd
## Train test split
Split the data into multiple train/test sets so that the prediction accuracy of the model can be assessed against every country.
country_names = ['Germany', 'Poland', 'Czechia', 'Slovakia', 'Austria']

TRAINING1, TESTING1 = ['Poland', 'Germany', 'Austria', 'Czechia'], ['Slovakia']
TRAINING2, TESTING2 = ['Slovakia', 'Germany', 'Austria', 'Czechia'], ['Poland']
TRAINING3, TESTING3 = ['Poland', 'Slovakia', 'Austria', 'Czechia'], ['Germany']
TRAINING4, TESTING4 = ['Poland', 'Germany', 'Slovakia', 'Czechia'], ['Austria']
TRAINING5, TESTING5 = ['Poland', 'Germany', 'Austria', 'Slovakia'], ['Czechia']
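The five pairs follow a leave-one-country-out pattern, so they could equally be generated from `country_names`; a minimal sketch (the generated pairs match the constants above, up to list ordering):

```python
# leave-one-country-out: each country serves as the test set exactly once
splits = [
    ([c for c in country_names if c != test], [test])
    for test in country_names
]
```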
= "/data/uscuni-eurofab/"
regions_datadir = '/data/uscuni-eurofab/processed_data/tessellations/'
tessellations_dir = '/data/uscuni-eurofab/processed_data/buildings/'
buildings_dir
= '/data/uscuni-eurofab/processed_data/chars/'
data_dir = '/data/uscuni-eurofab/processed_data/target_clusters/'
target_dir = '/data/uscuni-eurofab/processed_data/hexagons/'
hex_dir
region_hulls = gpd.read_parquet(
    regions_datadir + "regions/" + "ms_ce_region_hulls.parquet"
)
Download a GeoJSON of all EU country boundaries, if not already present.
# download countries
# !wget https://gisco-services.ec.europa.eu/distribution/v2/countries/geojson/CNTR_RG_01M_2024_3035.geojson
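For a non-interactive run, the same conditional download can be done from Python. A minimal sketch using the standard library (same URL and filename as above):

```python
import os
from urllib.request import urlretrieve

url = ("https://gisco-services.ec.europa.eu/distribution/v2/countries/"
       "geojson/CNTR_RG_01M_2024_3035.geojson")
# download only if the file is not already present
if not os.path.exists("CNTR_RG_01M_2024_3035.geojson"):
    urlretrieve(url, "CNTR_RG_01M_2024_3035.geojson")
```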
## Assign regions to countries
countries = gpd.read_file('CNTR_RG_01M_2024_3035.geojson').to_crs(epsg=3035)
country_polygons = countries[countries['NAME_ENGL'].isin(country_names)]

region_idxs, country_idxs = country_polygons.sindex.query(region_hulls.geometry, predicate='intersects')
intersections = region_hulls.iloc[region_idxs].intersection(country_polygons.iloc[country_idxs], align=False).area

country_polygons
|     | CNTR_ID | CNTR_NAME       | NAME_ENGL | NAME_FREN | ISO3_CODE | SVRG_UN         | CAPT       | EU_STAT | EFTA_STAT | CC_STAT | NAME_GERM   | geometry |
|-----|---------|-----------------|-----------|-----------|-----------|-----------------|------------|---------|-----------|---------|-------------|----------|
| 11  | AT      | Österreich      | Austria   | Autriche  | AUT       | UN Member State | Vienna     | T       | F         | F       | Österreich  | MULTIPOLYGON (((4354847.685 2714710.627, 43552... |
| 55  | CZ      | Česká Republika | Czechia   | Tchéquie  | CZE       | UN Member State | Prague     | T       | F         | F       | Tschechien  | MULTIPOLYGON (((4624842.426 3112217.365, 46255... |
| 56  | DE      | Deutschland     | Germany   | Allemagne | DEU       | UN Member State | Berlin     | T       | F         | F       | Deutschland | MULTIPOLYGON (((4355225.354 2715902.995, 43548... |
| 170 | PL      | Polska          | Poland    | Pologne   | POL       | UN Member State | Warsaw     | T       | F         | F       | Polen       | MULTIPOLYGON (((4852825.195 3556096.333, 48551... |
| 192 | SK      | Slovensko       | Slovakia  | Slovaquie | SVK       | UN Member State | Bratislava | T       | F         | F       | Slowakei    | MULTIPOLYGON (((5003133.924 2988592.038, 50037... |
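When `sindex.query` is given an array of geometries, it returns two aligned integer arrays: positions into the query geometries and positions into the indexed geometries, one entry per pair that satisfies the predicate. A toy illustration with made-up boxes (not project data):

```python
from shapely.geometry import box

# two query boxes against a two-box spatial index
left = gpd.GeoSeries([box(0, 0, 2, 2), box(5, 5, 6, 6)])
tree = gpd.GeoSeries([box(1, 1, 3, 3), box(5.5, 5.5, 7, 7)])

left_idx, tree_idx = tree.sindex.query(left, predicate='intersects')
# left_idx -> array([0, 1]), tree_idx -> array([0, 1]):
# position i pairs left[left_idx[i]] with tree[tree_idx[i]]
```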
intersection_df = gpd.GeoDataFrame(
    {'region_id': region_hulls.index[region_idxs].values,
     'country': country_polygons.iloc[country_idxs, 2].values,
     'intersection_area': intersections.values,
     'geometry': region_hulls.iloc[region_idxs, 0].values
    },
    crs=region_hulls.crs
)
intersection_df = intersection_df.sort_values('intersection_area', ascending=False)
intersection_df = intersection_df[~intersection_df.region_id.duplicated()].sort_values('region_id')

assert (intersection_df.region_id == region_hulls.index).all()
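A region hull on a border can intersect more than one country, so a `region_id` may appear several times. Sorting by `intersection_area` in descending order and then dropping duplicated ids keeps only the row with the largest overlap, i.e. it assigns each region to the country it mostly falls in. A toy illustration of the idiom (made-up values):

```python
toy = pd.DataFrame({'region_id': [1, 1, 2],
                    'country': ['Poland', 'Germany', 'Czechia'],
                    'intersection_area': [10.0, 90.0, 5.0]})
toy = toy.sort_values('intersection_area', ascending=False)
toy = toy[~toy.region_id.duplicated()]
# region 1 is assigned to Germany (overlap 90 > 10), region 2 to Czechia
```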
def combine_regions_data(selected_regions):
    """Combine the morphometric data, target labels and hexagons from all of the 'selected_regions' into a single dataframe for model training."""
    all_data = []
    all_labels = []
    all_hexagons = []

    for trid in selected_regions:
        data = pd.read_parquet(f'{data_dir}primary_chars_{trid}.parquet')
        targets = pd.read_parquet(f'{target_dir}{trid}_target.pq').set_index('index')
        hexagons = pd.read_parquet(f'{hex_dir}{trid}_hexagon.pq').set_index('index')

        common_index = data.index.join(targets.index, how='inner').join(hexagons.index, how='inner')
        data = data.loc[common_index]
        targets = targets.loc[common_index]
        hexagons = hexagons.loc[common_index]

        # record region_id in the index
        common_index = str(trid) + '_' + common_index.astype(str)

        data = data.set_index(common_index)
        targets = targets.set_index(common_index)
        hexagons = hexagons.set_index(common_index)

        all_data.append(data)
        all_labels.append(targets)
        all_hexagons.append(hexagons)

    all_data = pd.concat(all_data)
    all_labels = pd.concat(all_labels)
    all_hexagons = pd.concat(all_hexagons)

    return all_data, all_labels, all_hexagons
### Test
# selected_regions = [65806]
# all_data, all_labels, all_hexagons = combine_regions_data(selected_regions)
# all_data.to_parquet('/data/uscuni-eurofab/processed_data/train_test_data/training_data1.pq')
# all_labels.to_parquet('/data/uscuni-eurofab/processed_data/train_test_data/training_labels1.pq')
# all_hexagons.to_parquet('/data/uscuni-eurofab/processed_data/train_test_data/training_hexagons1.pq')
Generate the final country-level train/test datasets.
%%time
training_countries = TRAINING1
testing_countries = TESTING1
i = 1

training_regions = intersection_df[intersection_df['country'].isin(training_countries)].region_id.values
testing_regions = intersection_df[intersection_df['country'].isin(testing_countries)].region_id.values

all_data, all_labels, all_hexagons = combine_regions_data(training_regions)
all_data.to_parquet('/data/uscuni-eurofab/processed_data/train_test_data/training_data1.pq')
all_labels.to_parquet('/data/uscuni-eurofab/processed_data/train_test_data/training_labels1.pq')
all_hexagons.to_parquet('/data/uscuni-eurofab/processed_data/train_test_data/training_hexagons1.pq')

all_data, all_labels, all_hexagons = combine_regions_data(testing_regions)
all_data.to_parquet('/data/uscuni-eurofab/processed_data/train_test_data/testing_data1.pq')
all_labels.to_parquet('/data/uscuni-eurofab/processed_data/train_test_data/testing_labels1.pq')
CPU times: user 3min, sys: 1min 3s, total: 4min 3s
Wall time: 2min 23s
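The cells below repeat this pattern for splits 2 to 5, changing only `i` and the country constants. The whole procedure could equivalently be written as a single loop; a sketch, assuming the constants and output directory used above (note the cells persist hexagons only for the training sets):

```python
out_dir = '/data/uscuni-eurofab/processed_data/train_test_data/'
splits = [(TRAINING1, TESTING1), (TRAINING2, TESTING2), (TRAINING3, TESTING3),
          (TRAINING4, TESTING4), (TRAINING5, TESTING5)]

for i, (train_countries, test_countries) in enumerate(splits, start=1):
    for role, countries_ in (('training', train_countries), ('testing', test_countries)):
        regions = intersection_df[intersection_df['country'].isin(countries_)].region_id.values
        data, labels, hexagons = combine_regions_data(regions)
        data.to_parquet(f'{out_dir}{role}_data{i}.pq')
        labels.to_parquet(f'{out_dir}{role}_labels{i}.pq')
        if role == 'training':  # hexagons are only saved for training sets here
            hexagons.to_parquet(f'{out_dir}{role}_hexagons{i}.pq')
```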
%%time
training_countries = TRAINING2
testing_countries = TESTING2
i = 2

training_regions = intersection_df[intersection_df['country'].isin(training_countries)].region_id.values
testing_regions = intersection_df[intersection_df['country'].isin(testing_countries)].region_id.values

all_data, all_labels, all_hexagons = combine_regions_data(training_regions)
all_data.to_parquet(f'/data/uscuni-eurofab/processed_data/train_test_data/training_data{i}.pq')
all_labels.to_parquet(f'/data/uscuni-eurofab/processed_data/train_test_data/training_labels{i}.pq')
all_hexagons.to_parquet(f'/data/uscuni-eurofab/processed_data/train_test_data/training_hexagons{i}.pq')

all_data, all_labels, all_hexagons = combine_regions_data(testing_regions)
all_data.to_parquet(f'/data/uscuni-eurofab/processed_data/train_test_data/testing_data{i}.pq')
all_labels.to_parquet(f'/data/uscuni-eurofab/processed_data/train_test_data/testing_labels{i}.pq')
%%time
training_countries = TRAINING3
testing_countries = TESTING3
i = 3

training_regions = intersection_df[intersection_df['country'].isin(training_countries)].region_id.values
testing_regions = intersection_df[intersection_df['country'].isin(testing_countries)].region_id.values

all_data, all_labels, all_hexagons = combine_regions_data(training_regions)
all_data.to_parquet(f'/data/uscuni-eurofab/processed_data/train_test_data/training_data{i}.pq')
all_labels.to_parquet(f'/data/uscuni-eurofab/processed_data/train_test_data/training_labels{i}.pq')
all_hexagons.to_parquet(f'/data/uscuni-eurofab/processed_data/train_test_data/training_hexagons{i}.pq')

all_data, all_labels, all_hexagons = combine_regions_data(testing_regions)
all_data.to_parquet(f'/data/uscuni-eurofab/processed_data/train_test_data/testing_data{i}.pq')
all_labels.to_parquet(f'/data/uscuni-eurofab/processed_data/train_test_data/testing_labels{i}.pq')
%%time
training_countries = TRAINING4
testing_countries = TESTING4
i = 4

training_regions = intersection_df[intersection_df['country'].isin(training_countries)].region_id.values
testing_regions = intersection_df[intersection_df['country'].isin(testing_countries)].region_id.values

all_data, all_labels, all_hexagons = combine_regions_data(training_regions)
all_data.to_parquet(f'/data/uscuni-eurofab/processed_data/train_test_data/training_data{i}.pq')
all_labels.to_parquet(f'/data/uscuni-eurofab/processed_data/train_test_data/training_labels{i}.pq')
all_hexagons.to_parquet(f'/data/uscuni-eurofab/processed_data/train_test_data/training_hexagons{i}.pq')

all_data, all_labels, all_hexagons = combine_regions_data(testing_regions)
all_data.to_parquet(f'/data/uscuni-eurofab/processed_data/train_test_data/testing_data{i}.pq')
all_labels.to_parquet(f'/data/uscuni-eurofab/processed_data/train_test_data/testing_labels{i}.pq')
region_hulls.loc[testing_regions].explore()
(Interactive map output: hulls of the selected testing regions.)
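`explore()` produces an interactive folium map, which only renders in a trusted notebook. If a static figure is enough, a matplotlib-based alternative:

```python
# static alternative to the interactive map (requires matplotlib)
region_hulls.loc[testing_regions].plot()
```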
%%time
training_countries = TRAINING5
testing_countries = TESTING5
i = 5

training_regions = intersection_df[intersection_df['country'].isin(training_countries)].region_id.values
testing_regions = intersection_df[intersection_df['country'].isin(testing_countries)].region_id.values

all_data, all_labels, all_hexagons = combine_regions_data(training_regions)
all_data.to_parquet(f'/data/uscuni-eurofab/processed_data/train_test_data/training_data{i}.pq')
all_labels.to_parquet(f'/data/uscuni-eurofab/processed_data/train_test_data/training_labels{i}.pq')
all_hexagons.to_parquet(f'/data/uscuni-eurofab/processed_data/train_test_data/training_hexagons{i}.pq')

all_data, all_labels, all_hexagons = combine_regions_data(testing_regions)
all_data.to_parquet(f'/data/uscuni-eurofab/processed_data/train_test_data/testing_data{i}.pq')
all_labels.to_parquet(f'/data/uscuni-eurofab/processed_data/train_test_data/testing_labels{i}.pq')