Train/test split

Split the data into multiple train/test sets so that the model's prediction accuracy can be assessed against every country: each fold trains on four of the five countries and tests on the one held out.

import geopandas as gpd
import pandas as pd
# leave-one-country-out folds: train on four countries, test on the held-out one
country_names = ['Germany', 'Poland', 'Czechia', 'Slovakia', 'Austria']
TRAINING1, TESTING1 = ['Poland', 'Germany', 'Austria', 'Czechia'], ['Slovakia']
TRAINING2, TESTING2 = ['Slovakia', 'Germany', 'Austria', 'Czechia'], ['Poland']
TRAINING3, TESTING3 = ['Poland', 'Slovakia', 'Austria', 'Czechia'], ['Germany']
TRAINING4, TESTING4 = ['Poland', 'Germany', 'Slovakia', 'Czechia'], ['Austria']
TRAINING5, TESTING5 = ['Poland', 'Germany', 'Austria', 'Slovakia'], ['Czechia']
regions_datadir = "/data/uscuni-eurofab/"
tessellations_dir = '/data/uscuni-eurofab/processed_data/tessellations/'
buildings_dir = '/data/uscuni-eurofab/processed_data/buildings/'

data_dir = '/data/uscuni-eurofab/processed_data/chars/'
target_dir = '/data/uscuni-eurofab/processed_data/target_clusters/'
hex_dir = '/data/uscuni-eurofab/processed_data/hexagons/'

region_hulls = gpd.read_parquet(
    regions_datadir + "regions/" + "ms_ce_region_hulls.parquet"
)

Download the GISCO GeoJSON of country boundaries, if not already present.

# download countries
# !wget https://gisco-services.ec.europa.eu/distribution/v2/countries/geojson/CNTR_RG_01M_2024_3035.geojson
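The commented command above relies on wget; a pure-Python alternative that only downloads when the file is missing is sketched below (a minimal sketch using the standard library, with the URL and filename taken from the command above).

import os
import urllib.request

GEOJSON_URL = 'https://gisco-services.ec.europa.eu/distribution/v2/countries/geojson/CNTR_RG_01M_2024_3035.geojson'
GEOJSON_PATH = 'CNTR_RG_01M_2024_3035.geojson'

# download only when the file is not already present
if not os.path.exists(GEOJSON_PATH):
    urllib.request.urlretrieve(GEOJSON_URL, GEOJSON_PATH)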

Assign each region to the country it overlaps most.

countries = gpd.read_file('CNTR_RG_01M_2024_3035.geojson').to_crs(epsg=3035)
country_polygons = countries[countries['NAME_ENGL'].isin(country_names)]
# paired positional indices: each region hull and the country polygon it intersects
region_idxs, country_idxs = country_polygons.sindex.query(region_hulls.geometry, predicate='intersects')
# overlap area for every region/country pair
intersections = region_hulls.iloc[region_idxs].intersection(country_polygons.iloc[country_idxs], align=False).area
country_polygons
|     | CNTR_ID | CNTR_NAME | NAME_ENGL | NAME_FREN | ISO3_CODE | SVRG_UN | CAPT | EU_STAT | EFTA_STAT | CC_STAT | NAME_GERM | geometry |
|-----|---------|-----------|-----------|-----------|-----------|---------|------|---------|-----------|---------|-----------|----------|
| 11  | AT | Österreich | Austria | Autriche | AUT | UN Member State | Vienna | T | F | F | Österreich | MULTIPOLYGON (((4354847.685 2714710.627, 43552... |
| 55  | CZ | Česká Republika | Czechia | Tchéquie | CZE | UN Member State | Prague | T | F | F | Tschechien | MULTIPOLYGON (((4624842.426 3112217.365, 46255... |
| 56  | DE | Deutschland | Germany | Allemagne | DEU | UN Member State | Berlin | T | F | F | Deutschland | MULTIPOLYGON (((4355225.354 2715902.995, 43548... |
| 170 | PL | Polska | Poland | Pologne | POL | UN Member State | Warsaw | T | F | F | Polen | MULTIPOLYGON (((4852825.195 3556096.333, 48551... |
| 192 | SK | Slovensko | Slovakia | Slovaquie | SVK | UN Member State | Bratislava | T | F | F | Slowakei | MULTIPOLYGON (((5003133.924 2988592.038, 50037... |
intersection_df = gpd.GeoDataFrame(
    {
        'region_id': region_hulls.index[region_idxs].values,
        'country': country_polygons.iloc[country_idxs, 2].values,  # column 2 is NAME_ENGL
        'intersection_area': intersections.values,
        'geometry': region_hulls.iloc[region_idxs, 0].values,  # column 0 holds the hull geometry
    },
    crs=region_hulls.crs,
)

# a region can intersect several countries; keep, for each region,
# only the country with the largest overlap
intersection_df = intersection_df.sort_values('intersection_area', ascending=False)
intersection_df = intersection_df[~intersection_df.region_id.duplicated()].sort_values('region_id')
assert (intersection_df.region_id == region_hulls.index).all()
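An optional sanity check, not part of the original pipeline: after deduplication every region should map to exactly one country, and the per-country region counts should look plausible.

# optional sanity check on the region-to-country assignment
assert intersection_df.region_id.is_unique
print(intersection_df['country'].value_counts())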
def combine_regions_data(selected_regions):
    """Combine the morphometric data, target labels and hexagons from all of the 'selected_regions' into a single dataframe for model training."""
    
    all_data = []
    all_labels = []
    all_hexagons = []
    
    for trid in selected_regions:
        data = pd.read_parquet(f'{data_dir}primary_chars_{trid}.parquet')
        targets = pd.read_parquet(f'{target_dir}{trid}_target.pq').set_index('index')
        hexagons = pd.read_parquet(f'{hex_dir}{trid}_hexagon.pq').set_index('index')
    
        common_index = data.index.join(targets.index, how='inner').join(hexagons.index, how='inner')
        data = data.loc[common_index]
        targets = targets.loc[common_index]
        hexagons = hexagons.loc[common_index]
    
        # record region_id in the index
        common_index = str(trid) + '_' + common_index.astype(str)
        
        data = data.set_index(common_index)
        targets = targets.set_index(common_index)
        hexagons = hexagons.set_index(common_index)
    
        all_data.append(data)
        all_labels.append(targets)
        all_hexagons.append(hexagons)
    
    
    all_data = pd.concat(all_data)
    all_labels = pd.concat(all_labels)
    all_hexagons = pd.concat(all_hexagons)
    
    return all_data, all_labels, all_hexagons
### Test


# selected_regions = [65806]
# all_data, all_labels, all_hexagons = combine_regions_data(selected_regions)

# all_data.to_parquet('/data/uscuni-eurofab/processed_data/train_test_data/training_data1.pq')
# all_labels.to_parquet('/data/uscuni-eurofab/processed_data/train_test_data/training_labels1.pq')
# all_hexagons.to_parquet('/data/uscuni-eurofab/processed_data/train_test_data/training_hexagons1.pq')

Generate the final country-level train/test datasets, one fold per cell.
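The five cells below materialise one fold each. For reference, the same procedure can be written as a single loop over the folds; this is only a sketch of the repeated pattern, assuming the output directory exists (note that, as in the cells below, hexagons are not written out for the test side).

folds = [
    (1, TRAINING1, TESTING1),
    (2, TRAINING2, TESTING2),
    (3, TRAINING3, TESTING3),
    (4, TRAINING4, TESTING4),
    (5, TRAINING5, TESTING5),
]
out_dir = '/data/uscuni-eurofab/processed_data/train_test_data/'

for i, training_countries, testing_countries in folds:
    training_regions = intersection_df[intersection_df['country'].isin(training_countries)].region_id.values
    testing_regions = intersection_df[intersection_df['country'].isin(testing_countries)].region_id.values

    # training data, labels and hexagons
    data, labels, hexagons = combine_regions_data(training_regions)
    data.to_parquet(f'{out_dir}training_data{i}.pq')
    labels.to_parquet(f'{out_dir}training_labels{i}.pq')
    hexagons.to_parquet(f'{out_dir}training_hexagons{i}.pq')

    # testing data and labels (hexagons are not saved for the test side)
    data, labels, _ = combine_regions_data(testing_regions)
    data.to_parquet(f'{out_dir}testing_data{i}.pq')
    labels.to_parquet(f'{out_dir}testing_labels{i}.pq')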

%%time

training_countries = TRAINING1
testing_countries = TESTING1
i = 1

training_regions = intersection_df[intersection_df['country'].isin(training_countries)].region_id.values
testing_regions = intersection_df[intersection_df['country'].isin(testing_countries)].region_id.values

all_data, all_labels, all_hexagons = combine_regions_data(training_regions)

all_data.to_parquet(f'/data/uscuni-eurofab/processed_data/train_test_data/training_data{i}.pq')
all_labels.to_parquet(f'/data/uscuni-eurofab/processed_data/train_test_data/training_labels{i}.pq')
all_hexagons.to_parquet(f'/data/uscuni-eurofab/processed_data/train_test_data/training_hexagons{i}.pq')


all_data, all_labels, all_hexagons = combine_regions_data(testing_regions)

all_data.to_parquet(f'/data/uscuni-eurofab/processed_data/train_test_data/testing_data{i}.pq')
all_labels.to_parquet(f'/data/uscuni-eurofab/processed_data/train_test_data/testing_labels{i}.pq')
CPU times: user 3min, sys: 1min 3s, total: 4min 3s
Wall time: 2min 23s
%%time

training_countries = TRAINING2
testing_countries = TESTING2
i = 2

training_regions = intersection_df[intersection_df['country'].isin(training_countries)].region_id.values
testing_regions = intersection_df[intersection_df['country'].isin(testing_countries)].region_id.values

all_data, all_labels, all_hexagons = combine_regions_data(training_regions)

all_data.to_parquet(f'/data/uscuni-eurofab/processed_data/train_test_data/training_data{i}.pq')
all_labels.to_parquet(f'/data/uscuni-eurofab/processed_data/train_test_data/training_labels{i}.pq')
all_hexagons.to_parquet(f'/data/uscuni-eurofab/processed_data/train_test_data/training_hexagons{i}.pq')


all_data, all_labels, all_hexagons = combine_regions_data(testing_regions)

all_data.to_parquet(f'/data/uscuni-eurofab/processed_data/train_test_data/testing_data{i}.pq')
all_labels.to_parquet(f'/data/uscuni-eurofab/processed_data/train_test_data/testing_labels{i}.pq')
%%time

training_countries = TRAINING3
testing_countries = TESTING3
i = 3

training_regions = intersection_df[intersection_df['country'].isin(training_countries)].region_id.values
testing_regions = intersection_df[intersection_df['country'].isin(testing_countries)].region_id.values

all_data, all_labels, all_hexagons = combine_regions_data(training_regions)

all_data.to_parquet(f'/data/uscuni-eurofab/processed_data/train_test_data/training_data{i}.pq')
all_labels.to_parquet(f'/data/uscuni-eurofab/processed_data/train_test_data/training_labels{i}.pq')
all_hexagons.to_parquet(f'/data/uscuni-eurofab/processed_data/train_test_data/training_hexagons{i}.pq')


all_data, all_labels, all_hexagons = combine_regions_data(testing_regions)

all_data.to_parquet(f'/data/uscuni-eurofab/processed_data/train_test_data/testing_data{i}.pq')
all_labels.to_parquet(f'/data/uscuni-eurofab/processed_data/train_test_data/testing_labels{i}.pq')
%%time

training_countries = TRAINING4
testing_countries = TESTING4
i = 4

training_regions = intersection_df[intersection_df['country'].isin(training_countries)].region_id.values
testing_regions = intersection_df[intersection_df['country'].isin(testing_countries)].region_id.values

all_data, all_labels, all_hexagons = combine_regions_data(training_regions)

all_data.to_parquet(f'/data/uscuni-eurofab/processed_data/train_test_data/training_data{i}.pq')
all_labels.to_parquet(f'/data/uscuni-eurofab/processed_data/train_test_data/training_labels{i}.pq')
all_hexagons.to_parquet(f'/data/uscuni-eurofab/processed_data/train_test_data/training_hexagons{i}.pq')


all_data, all_labels, all_hexagons = combine_regions_data(testing_regions)

all_data.to_parquet(f'/data/uscuni-eurofab/processed_data/train_test_data/testing_data{i}.pq')
all_labels.to_parquet(f'/data/uscuni-eurofab/processed_data/train_test_data/testing_labels{i}.pq')

# visual check of the held-out (testing) regions for the current fold
region_hulls.loc[testing_regions].explore()
%%time

training_countries = TRAINING5
testing_countries = TESTING5
i = 5

training_regions = intersection_df[intersection_df['country'].isin(training_countries)].region_id.values
testing_regions = intersection_df[intersection_df['country'].isin(testing_countries)].region_id.values

all_data, all_labels, all_hexagons = combine_regions_data(training_regions)

all_data.to_parquet(f'/data/uscuni-eurofab/processed_data/train_test_data/training_data{i}.pq')
all_labels.to_parquet(f'/data/uscuni-eurofab/processed_data/train_test_data/training_labels{i}.pq')
all_hexagons.to_parquet(f'/data/uscuni-eurofab/processed_data/train_test_data/training_hexagons{i}.pq')


all_data, all_labels, all_hexagons = combine_regions_data(testing_regions)

all_data.to_parquet(f'/data/uscuni-eurofab/processed_data/train_test_data/testing_data{i}.pq')
all_labels.to_parquet(f'/data/uscuni-eurofab/processed_data/train_test_data/testing_labels{i}.pq')
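
Once a fold has been written, it can be sanity-checked by confirming that its training and testing indices come from disjoint region sets. A minimal sketch for one fold, assuming the files above exist; it relies on the indices having been prefixed with the region id in combine_regions_data.

i = 5  # fold to check
base = '/data/uscuni-eurofab/processed_data/train_test_data/'
train_idx = pd.read_parquet(f'{base}training_data{i}.pq').index
test_idx = pd.read_parquet(f'{base}testing_data{i}.pq').index

# indices look like '<region_id>_<element_id>', so comparing the prefixes
# confirms that no region appears on both sides of the split
train_regions = {idx.split('_')[0] for idx in train_idx}
test_regions = {idx.split('_')[0] for idx in test_idx}
assert train_regions.isdisjoint(test_regions)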