Surface Water Level Monitoring with Sentinel-2
Timeseries of water covered area at Santa Giustina lake, Italy

Table of contents¶
Introduction¶
This notebook demonstrates how to estimate and visualize surface water area measurements at Santa Giustina lake, Italy, using Sentinel-2 data stored in EOPF-Zarr format.
The lake¶
Lago di Santa Giustina is an artificial lake located in the Trentino region of northern Italy. It was created in the 1950s by damming the Noce River for hydroelectric power generation. The lake is situated at an elevation of approximately 500 meters above sea level and has a surface area of about 3.5 square kilometers.

Image source: By Mosco - Own work, CC BY-SA 3.0, https://
Setup¶
Start importing the necessary libraries
import xarray as xr
import geopandas as gpd
import matplotlib.pyplot as plt
import numpy as np
import pystac_client
from rasterio.features import geometry_mask
from skimage.filters import threshold_otsu
import pandas as pd
from skimage import measure
from pyproj import Transformer
import dask
from dask_gateway import Gateway
import warnings
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=DeprecationWarning)"""
gate = Gateway("https://dask.user.eopf.eodc.eu",
proxy_address="tls://dask.user.eopf.eodc.eu:10000",
auth="jupyterhub")
"""
# Dask Gateway automatically loads the correct configuration.
# So the code below is identical to the comment above
gate = Gateway()
cluster = gate.new_cluster()
cluster.scale(8)client = cluster.get_client()
clientAccess data¶
Query the EOPF Sentinel-2 STAC Collection for the area and time of interest.
catalog = pystac_client.Client.open("https://stac.core.eopf.eodc.eu")
bbox = [
11.003288060439315,
46.34055268553584,
11.10369555555596,
46.40183319621778,
]
items = list(
catalog.search(
collections=["sentinel-2-l2a"],
bbox=bbox,
datetime=["2020-12-31", "2024-01-01"],
).items()
)
print(f"items found: {len(items)}")items found: 434
Combine the products into a single datacube:
%%time
def reproject_bbox(bbox, src_crs="EPSG:4326", dst_crs="EPSG:32632"):
transformer = Transformer.from_crs(src_crs, dst_crs, always_xy=True)
xmin, ymin = transformer.transform(bbox[0], bbox[1])
xmax, ymax = transformer.transform(bbox[2], bbox[3])
return [xmin, ymin, xmax, ymax]
def load_tile(item, bbox_ll):
href = item.assets["product"].href
ds = xr.open_datatree(href, engine="zarr", chunks={}, mask_and_scale=True)
dst_crs = item.properties.get("proj:code")
if dst_crs is None:
raise ValueError(f"No proj:code in item {item.id}")
utm_bbox = reproject_bbox(bbox_ll, dst_crs=dst_crs)
ds_ref = (
ds["measurements"]["reflectance"]["r20m"]
.to_dataset()[["b02", "b03", "b04", "b8a"]]
.sel(x=slice(utm_bbox[0], utm_bbox[2]), y=slice(utm_bbox[3], utm_bbox[1]))
.expand_dims(time=[item.datetime])
)
ds_scl = (
ds["conditions"]["mask"]["l2a_classification"]["r20m"]
.to_dataset()
.sel(x=slice(utm_bbox[0], utm_bbox[2]), y=slice(utm_bbox[3], utm_bbox[1]))
.expand_dims(time=[item.datetime])
)
return xr.merge([ds_ref, ds_scl])
delayed_results = [dask.delayed(load_tile)(item, bbox) for item in items]
results = dask.compute(*delayed_results)
# Get CRS from STAC item
crs = items[0].properties.get("proj:code")
datacube = xr.concat(results, dim="time").sortby("time")
datacube = datacube.rio.write_crs(crs, inplace=True)
datacubeOpen the geoJSON file containing the lake boundaries:
lake_gdf = gpd.read_file("./geojson/lago_santa_giustina.geojson")
lake_gdf = lake_gdf.to_crs(crs)Visualize datacube sample¶
In this section we visualize an RGB composite for a random date and use the lake polygon as an outline to see if it is aligned correctly.
datacube_sample = datacube.isel(time=50)
rgb = datacube_sample[["b04", "b03", "b02"]].to_dataarray(dim="band")
rgb = rgb.assign_coords(band=["red", "green", "blue"])
vmin, vmax = 0, 0.25
rgb_scaled = (rgb - vmin) / (vmax - vmin)
rgb_scaled = rgb_scaled.clip(0, 1)
rgb_img = rgb_scaled.transpose("y", "x", "band").values
fig, ax = plt.subplots(figsize=(8, 8))
ax.imshow(
rgb_img,
extent=[
float(rgb.x.min()),
float(rgb.x.max()),
float(rgb.y.min()),
float(rgb.y.max()),
],
)
lake_gdf.boundary.plot(ax=ax, edgecolor="yellow", linewidth=2)
ax.set_title("RGB composite with lake boundary overlay")
ax.axis("off")
plt.show()
Cloud masking¶
In this section we apply the scl band to mask out the clouds. We visualize the same RGB composite as before, but with the mask applied.
valid_classes = [2, 4, 5, 6]
mask = datacube["scl"].isin(valid_classes)
datacube_masked = datacube.where(mask)
datacube_sample = datacube_masked.isel(time=50)
rgb = datacube_sample[["b04", "b03", "b02"]].to_dataarray(dim="band")
rgb = rgb.assign_coords(band=["red", "green", "blue"])
vmin, vmax = 0, 0.25
rgb_scaled = (rgb - vmin) / (vmax - vmin)
rgb_scaled = rgb_scaled.clip(0, 1) # keep in [0, 1]
rgb_img = rgb_scaled.transpose("y", "x", "band").values
fig, ax = plt.subplots(figsize=(8, 8))
ax.imshow(
rgb_img,
extent=[
float(rgb.x.min()),
float(rgb.x.max()),
float(rgb.y.min()),
float(rgb.y.max()),
],
)
lake_gdf.boundary.plot(ax=ax, edgecolor="yellow", linewidth=2)
ax.set_title("Cloud-masked RGB composite")
ax.axis("off")
plt.show()
We keep only the dates with a cloud coverage less than 5% over the lake:
valid_classes = [4, 5, 6]
threshold = 0.05
datacube_lake = datacube.rio.clip(lake_gdf.geometry, lake_gdf.crs)
clear_mask = datacube_lake["scl"].isin(valid_classes)
clear_fraction = clear_mask.mean(dim=("y", "x")).compute()
mask_good = clear_fraction >= threshold
datacube_clear = datacube_lake.isel(time=mask_good)
datacube_clearNDWI¶
Water detection is performed using the Normalized Difference Water Index (NDWI), which utilizes the green and near-infrared (NIR) bands of the Sentinel-2 imagery. The NDWI is calculated as follows:
ndwi = (datacube_clear["b03"] - datacube_clear["b8a"]) / (
datacube_clear["b03"] + datacube_clear["b8a"]
)Water detection¶
The water area is selected applying Otsu Thresholding:
def otsu_mask(arr):
arr = np.clip(np.nan_to_num(arr, nan=0.0), -1, 1)
valid = arr > -0.2
if np.count_nonzero(valid) < 10:
return np.zeros_like(arr, dtype=bool)
try:
thr = threshold_otsu(arr[valid])
except Exception:
thr = 0.0
thr -= 0.05
return arr > thr
ndwi = ndwi.chunk({"x": -1, "y": -1})
water_masks = xr.apply_ufunc(
otsu_mask,
ndwi,
input_core_dims=[["y", "x"]],
output_core_dims=[["y", "x"]],
vectorize=True,
dask="parallelized",
output_dtypes=[bool],
)
water_masks.compute()
water_masksSample visualization of computed water mask:
target_time = "2021-04"
datacube_full_rgb = datacube.sel(time=target_time, method="nearest")[
["b04", "b03", "b02"]
].to_dataarray("band")
datacube_full_rgb = datacube_full_rgb.assign_coords(band=["red", "green", "blue"])
vmin, vmax = 0, 0.25
rgb_scaled_full = (datacube_full_rgb - vmin) / (vmax - vmin)
rgb_scaled_full = rgb_scaled_full.clip(0, 1)
rgb_img_full = rgb_scaled_full.transpose("y", "x", "band").values
water_mask_da = water_masks.sel(time=target_time, method="nearest")
fig, ax = plt.subplots(figsize=(8, 8))
ax.imshow(
rgb_img_full,
extent=[
float(datacube_full_rgb.x.min()),
float(datacube_full_rgb.x.max()),
float(datacube_full_rgb.y.min()),
float(datacube_full_rgb.y.max()),
],
)
water_mask = water_mask_da.values
x_coords = water_mask_da.x.values
y_coords = water_mask_da.y.values
contours = measure.find_contours(water_mask, 0.5)
for contour in contours:
x_contour = np.interp(contour[:, 1], np.arange(len(x_coords)), x_coords)
y_contour = np.interp(contour[:, 0], np.arange(len(y_coords)), y_coords)
ax.plot(x_contour, y_contour, color="cyan", linewidth=1.5)
ax.set_title("Lake water mask over full-scene RGB (April 2021)")
ax.axis("off")
plt.show()
lake_masked = []
for t in range(len(datacube_clear.time)):
wm = xr.DataArray(
water_masks[t],
dims=("y", "x"),
coords={"y": datacube_clear.y, "x": datacube_clear.x},
name="water_mask",
).rio.write_crs(datacube_clear.rio.crs)
mask = geometry_mask(
[lake_gdf.geometry.unary_union],
transform=wm.rio.transform(),
invert=True,
out_shape=wm.shape,
)
wm_clipped = wm.where(mask)
lake_masked.append(wm_clipped)
water_masks_clipped = xr.concat(lake_masked, dim="time")water_fraction = water_masks_clipped.sum(dim=("y", "x")) / (
water_masks_clipped.notnull().sum(dim=("y", "x"))
)Water surface area extraction¶
df_water = pd.DataFrame(
{"time": water_fraction.time.values, "water_fraction": water_fraction.values}
)
df_water.set_index("time", inplace=True)pixel_area_m2 = 20 * 20
water_pixel_count = (water_masks_clipped == 1).sum(dim=("y", "x"))
water_area_m2 = water_pixel_count * pixel_area_m2
water_area_m2 = water_area_m2.assign_coords(time=water_masks_clipped.time)
dates = pd.to_datetime(water_masks.time.values)
water_area_ts = pd.Series(water_area_m2.values, index=dates)smoothed = water_area_ts.rolling(window=5, center=True, min_periods=1).mean()
plt.figure(figsize=(12, 5))
plt.plot(
water_area_ts.index,
water_area_ts.values / 1e6,
color="lightgray",
alpha=0.6,
label="Raw",
)
plt.plot(
smoothed.index,
smoothed.values / 1e6,
color="blue",
linewidth=2,
label="rolling mean",
)
plt.ylabel("Lake surface area (km²)")
plt.xlabel("Date")
plt.title("Lake surface area over time")
plt.grid(True, linestyle="--", alpha=0.6)
plt.legend()
plt.show()
cluster.shutdown()