import concurrent.futures
import functools
import io

import pandas as pd
import tensorflow as tf
from tqdm.notebook import tqdm


def read_pandas_csv(url, **read_opts):
  # Reading through tf.io.gfile is significantly faster for files stored in GCS.
  with tf.io.gfile.GFile(url, mode='rb') as f:
    return pd.read_csv(io.BytesIO(f.read()), **read_opts)


# Get all S2 cell tokens that contain buildings data.
# NOTE: Reading files directly from GCS is faster than the HTTP REST endpoint.
url_root = "gs://open-buildings-data/v2"
# url_root = "https://storage.googleapis.com/open-buildings-data/v2"
tokens = read_pandas_csv(f"{url_root}/score_thresholds_s2_level_4.csv").s2_token

# The geometry type can be "points" (centroids) or "polygons" (footprints).
poly_type = "points"  #@param ["points", "polygons"]

# Create a list with all URLs that we must download data from.
fnames = [f"{token}_buildings.csv.gz" for token in tokens]
poly_path = f"{url_root}/{poly_type}_s2_level_4_gzip"
urls = [f"{poly_path}/{fname}" for fname in fnames]

# Create a function that reads only a subset of fields given a URL.
columns = ["latitude", "longitude", "confidence"]
read_opts = dict(usecols=columns, compression='gzip')
map_func = functools.partial(read_pandas_csv, **read_opts)

# Download and parse the files in parallel, concatenating into one dataframe.
with concurrent.futures.ThreadPoolExecutor(max_workers=8) as executor:
  futures = [executor.submit(map_func, url) for url in urls]
  completed = tqdm(concurrent.futures.as_completed(futures), total=len(futures))
  table_iter = (future.result() for future in completed)
  df = pd.concat(table_iter, copy=False, ignore_index=True)
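
# Optional follow-up sketch (not part of the original snippet): the concatenated
# dataframe can be very large, so one common next step is to keep only rows above
# some confidence threshold and persist the result. The 0.75 cutoff and the output
# filename below are illustrative assumptions, not values from the dataset docs.
high_conf = df[df.confidence >= 0.75]
print(f"Kept {len(high_conf):,} of {len(df):,} building records")
high_conf.to_csv("buildings_points.csv.gz", index=False, compression="gzip")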