Pre-processing UERRA and EURO-CORDEX datasets#

Preparation Work#

Load libraries#

import itertools
import os

import cdsapi
import dask.diagnostics
import gisco_geodata
import numpy as np
import regionmask
import xarray as xr

Define area of interest#

Specify a name for your area of interest (region_name) and load a corresponding shape with geopandas (region_gdf).

For example, access NUTS regions from GISCO:

# Label used in output file and folder names for this region
region_name = 'IT'

# Fetch the region outline from GISCO as a GeoDataFrame
nuts = gisco_geodata.NUTS()
region_gdf = nuts.get(
    countries=region_name,   # put a NUTS ID here
    nuts_level='LEVL_0',     # adjust the NUTS level to match your ID
    scale='20M',             # data resolution: 1M, 3M, 10M, 20M or 60M
    spatial_type='RG',
    projection='4326',
)

Path configuration#

# Raw UERRA downloads land here
input_folder = '../data'
# Region-specific pre-processed output goes here
output_folder = f'../data_{region_name}'

# Make sure both directories exist before any download/write happens
for _folder in (input_folder, output_folder):
    os.makedirs(_folder, exist_ok=True)

CDS client setup#

To access the data on CDS, you need an ECMWF account. See the guide on how to set up the API for more information.

# Copernicus CDS endpoint and credentials.
# KEY may stay None if credentials are provided via ~/.cdsapirc instead.
URL = 'https://cds.climate.copernicus.eu/api'
KEY = None

# Client instance used for every data request below
cds_client = cdsapi.Client(url=URL, key=KEY)

Reanalysis data (UERRA)#

You can download the dataset UERRA regional reanalysis from the Climate Data Store. The UERRA dataset contains analyses of surface and near-surface essential climate variables from the UERRA-HARMONIE and MESCAN-SURFEX systems. For this assessment, data from the MESCAN-SURFEX system was downloaded for two different variables:

  • Air Temperature: At 2 meters above the surface (commonly referred to as 2m temperature).

  • Total Precipitation: The total amount of water (both liquid and solid forms) falling onto the ground or water surface. This dataset includes all types of precipitation and represents an accumulated value over 24 hours, from 06:00 on one day to 06:00 the following day.

Hide code cell source
def bbox_islice(mask):
    """Crop *mask* to the bounding box of its valid cells.

    Parameters
    ----------
    mask : xarray.DataArray
        2-D mask with dims ('y', 'x'); cells outside the region are NaN.

    Returns
    -------
    tuple
        ``(cropped_mask, slices)`` where ``slices`` maps ``'y'``/``'x'``
        to the integer slices that were applied.
    """
    rows, cols = np.where(mask.notnull())
    slc = {
        'y': slice(rows.min(), rows.max() + 1),
        'x': slice(cols.min(), cols.max() + 1),
    }
    return mask.isel(slc), slc


def clip_and_mask(ds, gdf):
    """Restrict *ds* to the area covered by the shapes in *gdf*.

    Rasterizes *gdf* onto the grid of *ds*, crops the dataset to the
    bounding box of the resulting mask, and sets every grid cell outside
    the shapes to NaN.
    """
    region_mask = regionmask.mask_geopandas(gdf, ds)
    region_mask, slc = bbox_islice(region_mask)
    clipped = ds.isel(slc)
    return clipped.where(region_mask.notnull())


def uerra_request(variable, years):
    """Download one UERRA (MESCAN-SURFEX) variable from the CDS.

    Parameters
    ----------
    variable : str
        CDS variable name, e.g. '2m_temperature' or 'total_precipitation'.
    years : str, int, or iterable of str/int
        Year(s) to request; a single value is treated as a one-element list.

    Returns
    -------
    str
        Path of the downloaded NetCDF file inside ``input_folder``.
    """
    # Normalize to a list of year strings: accept a single value or any
    # iterable (e.g. the tuples produced by itertools.batched). The
    # original only special-cased str, so int years broke '-'.join below.
    if isinstance(years, (str, int)):
        years = [years]
    years = [str(y) for y in years]

    dataset = "reanalysis-uerra-europe-single-levels"
    request = {
        "origin": "mescan_surfex",
        "variable": variable,
        "year": years,
        # Every month and every calendar day (01-31); four analyses per day
        "month": [f"{m:02d}" for m in range(1, 13)],
        "day": [f"{d:02d}" for d in range(1, 32)],
        "time": ["00:00", "06:00", "12:00", "18:00"],
        "data_format": "netcdf"
    }

    filename = os.path.join(
        input_folder, 'UERRA-{}-{}.nc'.format(variable, '-'.join(years))
    )
    return cds_client.retrieve(dataset, request).download(filename)

Time interval#

Select the range of years to download and process:

# Analysis period, inclusive on both ends
start_year = 1981
end_year = 2010

# Year labels for the CDS requests, plus a label slice for time selection
years = list(map(str, range(start_year, end_year + 1)))
years_slice = slice(str(start_year), str(end_year))

Air Temperature#

Download 2 meter temperature: (~2.3 GB per year)

# Request each year separately: the temperature files are large (~2.3 GB/yr)
for year in years:
    print(f"Requesting data for year {year}", flush=True)
    uerra_request(variable="2m_temperature", years=year)

Load, merge and clip all downloaded temperature datasets. Convert the temperature from Kelvin to Celsius and add a units attribute (°C), then write the result to the output folder for the selected region:

# Open all downloaded temperature files as one lazy (dask-backed) dataset
pattern = os.path.join(input_folder, 'UERRA-2m_temperature-*.nc')
ds_t2m = xr.open_mfdataset(pattern, combine='by_coords', chunks='auto')
ds_t2m = ds_t2m.rename({'valid_time': 'time'})

# Restrict to the analysis period, then to the region of interest
ds_t2m = ds_t2m.sel(time=years_slice)
ds_t2m = clip_and_mask(ds_t2m, region_gdf)

# Kelvin -> Celsius, and record the new unit on the variable
ds_t2m['t2m'] = ds_t2m['t2m'] - 273.15
ds_t2m['t2m'].attrs['units'] = 'C'

# Write the clipped, converted data with zlib compression;
# the ProgressBar reports the dask computation triggered by to_netcdf
output_file_path = os.path.join(output_folder, 'UERRA-t2m.nc')
with dask.diagnostics.ProgressBar():
    ds_t2m.to_netcdf(output_file_path, encoding={'t2m': {'compression': 'zlib'}})

print(f'Saved processed file to: {output_file_path}')

Total Precipitation#

Download total precipitation: (~0.4 GB per year)

# Precipitation files are small (~0.4 GB/yr), so request five years at a time
for batch in itertools.batched(years, 5):
    print(f"Requesting data for years {batch}", flush=True)
    uerra_request(variable="total_precipitation", years=batch)

Load, merge and clip all downloaded precipitation datasets, then write to the output folder for the selected region:

# Load and clip the dataset
ds_tp = (
    xr.open_mfdataset(
        os.path.join(input_folder, 'UERRA-total_precipitation-*.nc'),
        combine='by_coords',
        chunks='auto'
    )
    .rename({'valid_time': 'time'})
    # Clip to time interval
    .sel({'time': years_slice})
    # Clip to selected region
    .pipe(clip_and_mask, region_gdf)
)

# Precipitation given in kg/m² for a 24h period which is mm/day
ds_tp['tp'].attrs['units'] = 'mm/d'  # Add units attribute

# Save the processed and clipped data to a new NetCDF file
with dask.diagnostics.ProgressBar():
    output_file_path = os.path.join(output_folder, 'UERRA-tp.nc')
    ds_tp.to_netcdf(output_file_path, encoding={'tp': {'compression': 'zlib'}})

print(f'Saved processed file to: {output_file_path}')