VirtualiZarr → Kerchunk json → Icechunk demo

Author: Eli Holmes modification of Rich Signell’s notebook

Workflow

Open one NetCDF virtually
Write one Kerchunk json file per NetCDF file
Reopen the reference artifacts directly with the Kerchunk parser
Concatenate along time
Write the combined virtual dataset to Icechunk

For this demo, REF_FORMAT = "json" by default because JSON is easy to inspect. Change it to "parquet" for the same workflow using Parquet reference artifacts.

# Colab users, uncomment and run this
#!pip install -q icechunk  virtualizarr xarray obspec_utils obstore hvplot

import math
import shutil
import warnings
from pathlib import Path

import numpy as np
import xarray as xr
import icechunk
from obstore.store import from_url
from obspec_utils.registry import ObjectStoreRegistry
from virtualizarr import open_virtual_dataset
from virtualizarr.parsers import HDFParser, KerchunkJSONParser, KerchunkParquetParser

warnings.filterwarnings(
    "ignore",
    message="Numcodecs codecs are not in the Zarr version 3 specification*",
    category=UserWarning,
)

Setup

Use 5 consecutive daily NDVI files from the public NOAA CDR NDVI S3 bucket.

bucket = "s3://noaa-cdr-ndvi-pds"
base = "data/2000"

filenames = [
    "AVHRR-Land_v005_AVH13C1_NOAA-14_20000101_c20170623095628.nc",
    "AVHRR-Land_v005_AVH13C1_NOAA-14_20000102_c20170623101557.nc",
    "AVHRR-Land_v005_AVH13C1_NOAA-14_20000103_c20170623103338.nc",
    "AVHRR-Land_v005_AVH13C1_NOAA-14_20000104_c20170623105028.nc",
    "AVHRR-Land_v005_AVH13C1_NOAA-14_20000105_c20170623110559.nc",
]

urls = [f"{bucket}/{base}/{name}" for name in filenames]

store = from_url(bucket, region="us-east-1", skip_signature=True)
registry = ObjectStoreRegistry({bucket: store})
parser = HDFParser()

# For teaching, JSON is easier to inspect.
# Change to "parquet" to write Kerchunk Parquet reference directories instead.
REF_FORMAT = "json"  # "json" or "parquet"
REF_SUFFIX = ".json" if REF_FORMAT == "json" else ".parquet"

ref_dir = Path(f"ndvi_refs_{REF_FORMAT}")

urls

['s3://noaa-cdr-ndvi-pds/data/2000/AVHRR-Land_v005_AVH13C1_NOAA-14_20000101_c20170623095628.nc',
 's3://noaa-cdr-ndvi-pds/data/2000/AVHRR-Land_v005_AVH13C1_NOAA-14_20000102_c20170623101557.nc',
 's3://noaa-cdr-ndvi-pds/data/2000/AVHRR-Land_v005_AVH13C1_NOAA-14_20000103_c20170623103338.nc',
 's3://noaa-cdr-ndvi-pds/data/2000/AVHRR-Land_v005_AVH13C1_NOAA-14_20000104_c20170623105028.nc',
 's3://noaa-cdr-ndvi-pds/data/2000/AVHRR-Land_v005_AVH13C1_NOAA-14_20000105_c20170623110559.nc']

Helper: remove problematic `_FillValue` attrs

The NDVI source files can carry _FillValue: NaN on coordinate metadata such as latitude. That is not valid strict JSON metadata, and it can also confuse the xarray/Zarr v3 read path.

For this teaching demo, we keep this simple: remove attr-level _FillValue entries before writing refs and before writing Icechunk. The actual virtual chunk references are unchanged.

def clean_attrs(ds):
    # Drop attr-level _FillValue and any NaN attrs so refs/Icechunk metadata are strict JSON.
    ds = ds.copy(deep=False)

    def clean_one(attrs):
        out = {}
        for key, value in dict(attrs).items():
            if key == "_FillValue":
                continue

            if isinstance(value, np.generic):
                value = value.item()

            if isinstance(value, float) and math.isnan(value):
                continue

            out[key] = value
        return out

    ds.attrs = clean_one(ds.attrs)
    for name in ds.variables:
        ds[name].attrs = clean_one(ds[name].attrs)

    return ds

1. Open one NetCDF virtually

This checks that VirtualiZarr can read the NetCDF metadata. This does not download the NDVI data chunks.

vds = open_virtual_dataset(
    url=urls[0],
    parser=parser,
    registry=registry,
    loadable_variables=["time", "latitude", "longitude"],
    decode_times=True,
).drop_vars(["nv", "ncrs", "crs"], errors="ignore")

vds = clean_attrs(vds)
vds

# NDVI is virtual: the data are references back to the original NetCDF file.
vds["NDVI"].data

2. Write one Kerchunk reference artifact per NetCDF

This is intentionally serial and simple for teaching. In a real batch run, each machine could run this same loop for its assigned files.

With REF_FORMAT = "json", each output is one .json file. With REF_FORMAT = "parquet", each output is a .parquet directory.

# Start clean for the demo so stale refs do not cause confusing errors.
shutil.rmtree(ref_dir, ignore_errors=True)
ref_dir.mkdir(exist_ok=True)

ref_files = []

for url in urls:
    out = ref_dir / Path(url).name.replace(".nc", REF_SUFFIX)

    vds = open_virtual_dataset(
        url=url,
        parser=parser,
        registry=registry,
        loadable_variables=["time", "latitude", "longitude"],
        decode_times=True,
    ).drop_vars(["nv", "ncrs", "crs"], errors="ignore")

    vds = clean_attrs(vds)

    vds.vz.to_kerchunk(
        filepath=str(out),
        format=REF_FORMAT,
    )

    ref_files.append(out)
    print("wrote", out)

ref_files = sorted(ref_files)
ref_files

wrote ndvi_refs_json/AVHRR-Land_v005_AVH13C1_NOAA-14_20000101_c20170623095628.json
wrote ndvi_refs_json/AVHRR-Land_v005_AVH13C1_NOAA-14_20000102_c20170623101557.json
wrote ndvi_refs_json/AVHRR-Land_v005_AVH13C1_NOAA-14_20000103_c20170623103338.json
wrote ndvi_refs_json/AVHRR-Land_v005_AVH13C1_NOAA-14_20000104_c20170623105028.json
wrote ndvi_refs_json/AVHRR-Land_v005_AVH13C1_NOAA-14_20000105_c20170623110559.json

[PosixPath('ndvi_refs_json/AVHRR-Land_v005_AVH13C1_NOAA-14_20000101_c20170623095628.json'),
 PosixPath('ndvi_refs_json/AVHRR-Land_v005_AVH13C1_NOAA-14_20000102_c20170623101557.json'),
 PosixPath('ndvi_refs_json/AVHRR-Land_v005_AVH13C1_NOAA-14_20000103_c20170623103338.json'),
 PosixPath('ndvi_refs_json/AVHRR-Land_v005_AVH13C1_NOAA-14_20000104_c20170623105028.json'),
 PosixPath('ndvi_refs_json/AVHRR-Land_v005_AVH13C1_NOAA-14_20000105_c20170623110559.json')]

3. Reopen the reference artifacts and concatenate

Important: we reopen the Kerchunk refs with the parser directly instead of open_virtual_dataset(...). That avoids an xarray/Zarr v3 metadata decoding path that can fail on these files.

The files have a scalar CRS coordinate in the original NetCDF metadata, so we do not use coordinate-based combining. We explicitly concatenate in filename order along time.

ref_files = sorted(ref_dir.glob(f"*{REF_SUFFIX}"))
print(len(ref_files), "reference artifacts")
ref_files[:2]

5 reference artifacts

[PosixPath('ndvi_refs_json/AVHRR-Land_v005_AVH13C1_NOAA-14_20000101_c20170623095628.json'),
 PosixPath('ndvi_refs_json/AVHRR-Land_v005_AVH13C1_NOAA-14_20000102_c20170623101557.json')]

ref_root = f"file://{ref_dir.resolve()}"
ref_store = from_url(ref_root)

merge_registry = ObjectStoreRegistry({
    bucket: store,
    ref_root: ref_store,
})

ref_parser = KerchunkJSONParser() if REF_FORMAT == "json" else KerchunkParquetParser()

vds_list = []
for ref_file in ref_files:
    manifest_store = ref_parser(
        url=f"{ref_root}/{ref_file.name}",
        registry=merge_registry,
    )

    # Fully virtual dataset; avoids open_virtual_dataset(...), which internally calls xr.open_zarr(...).
    ds_one = manifest_store._group.to_virtual_dataset()
    ds_one = ds_one.drop_vars(["nv", "ncrs", "crs"], errors="ignore")
    ds_one = clean_attrs(ds_one)
    vds_list.append(ds_one)

combined_vds = xr.concat(
    vds_list,
    dim="time",
    data_vars="minimal",
    coords="minimal",
    combine_attrs="drop_conflicts",
    compat="override",
).drop_vars(["nv", "ncrs", "crs"], errors="ignore")

combined_vds = clean_attrs(combined_vds)
combined_vds

4. Write the combined virtual dataset to Icechunk

The final Icechunk repo contains virtual references to the original S3 NetCDF chunks. It does not copy the NDVI data.

repo_path = Path("ndvi-icechunk-kerchunk")
shutil.rmtree(repo_path, ignore_errors=True)

config = icechunk.RepositoryConfig.default()
config.set_virtual_chunk_container(
    icechunk.VirtualChunkContainer(
        url_prefix="s3://noaa-cdr-ndvi-pds/",
        store=icechunk.s3_store(region="us-east-1", anonymous=True),
    ),
)

storage = icechunk.local_filesystem_storage(str(repo_path))
repo = icechunk.Repository.create(storage, config)
session = repo.writable_session("main")

combined_vds = clean_attrs(combined_vds)
combined_vds.vz.to_icechunk(session.store)

snapshot_id = session.commit("Add 5-day NDVI CDR virtual references")
print("Committed:", snapshot_id)

  2026-06-04T02:47:05.659155Z  WARN icechunk_arrow_object_store: The LocalFileSystem storage is not safe for concurrent commits. If more than one thread/process will attempt to commit at the same time, prefer using object stores.

    at icechunk-arrow-object-store/src/lib.rs:196



Committed: 48090R4J16894CSXRW1G

5. Read back the Icechunk store

Authorize Icechunk to read the original public S3 chunks, then open the Icechunk store with xarray.

credentials = icechunk.containers_credentials({
    "s3://noaa-cdr-ndvi-pds/": icechunk.s3_credentials(anonymous=True)
})

repo2 = icechunk.Repository.open(
    storage,
    config,
    authorize_virtual_chunk_access=credentials,
)

session2 = repo2.readonly_session("main")

ds = xr.open_zarr(
    session2.store,
    consolidated=False,
    chunks=None,
)

ds

# Optional quick check: read one small-ish slice from the virtual dataset.
ds["NDVI"].isel(time=0, latitude=slice(0, 10), longitude=slice(0, 10)).values

Optional plot

This reads data lazily from the original S3 NetCDF files through Icechunk.

import hvplot.xarray  # noqa

ds["NDVI"].isel(time=0).hvplot(
    rasterize=True,
    geo=True,
    global_extent=True,
    x="longitude",
    y="latitude",
    tiles="OSM",
    cmap="YlGn",
    clim=(-0.1, 1.0),
    title="AVHRR NDVI — 2000-01-01",
    width=800,
    height=400,
)