# Colab users, uncomment and run this
#!pip install -q icechunk virtualizarr xarray obspec_utils obstore hvplotVirtualiZarr → Kerchunk json → Icechunk demo
Workflow
- Open one NetCDF virtually
- Write one Kerchunk json file per NetCDF file
- Reopen the reference artifacts directly with the Kerchunk parser
- Concatenate along
time - Write the combined virtual dataset to Icechunk
For this demo, REF_FORMAT = "json" by default because JSON is easy to inspect. Change it to "parquet" for the same workflow using Parquet reference artifacts.
import math
import shutil
import warnings
from pathlib import Path
import numpy as np
import xarray as xr
import icechunk
from obstore.store import from_url
from obspec_utils.registry import ObjectStoreRegistry
from virtualizarr import open_virtual_dataset
from virtualizarr.parsers import HDFParser, KerchunkJSONParser, KerchunkParquetParser
warnings.filterwarnings(
"ignore",
message="Numcodecs codecs are not in the Zarr version 3 specification*",
category=UserWarning,
)Setup
Use 5 consecutive daily NDVI files from the public NOAA CDR NDVI S3 bucket.
bucket = "s3://noaa-cdr-ndvi-pds"
base = "data/2000"
filenames = [
"AVHRR-Land_v005_AVH13C1_NOAA-14_20000101_c20170623095628.nc",
"AVHRR-Land_v005_AVH13C1_NOAA-14_20000102_c20170623101557.nc",
"AVHRR-Land_v005_AVH13C1_NOAA-14_20000103_c20170623103338.nc",
"AVHRR-Land_v005_AVH13C1_NOAA-14_20000104_c20170623105028.nc",
"AVHRR-Land_v005_AVH13C1_NOAA-14_20000105_c20170623110559.nc",
]
urls = [f"{bucket}/{base}/{name}" for name in filenames]
store = from_url(bucket, region="us-east-1", skip_signature=True)
registry = ObjectStoreRegistry({bucket: store})
parser = HDFParser()
# For teaching, JSON is easier to inspect.
# Change to "parquet" to write Kerchunk Parquet reference directories instead.
REF_FORMAT = "json" # "json" or "parquet"
REF_SUFFIX = ".json" if REF_FORMAT == "json" else ".parquet"
ref_dir = Path(f"ndvi_refs_{REF_FORMAT}")
urls['s3://noaa-cdr-ndvi-pds/data/2000/AVHRR-Land_v005_AVH13C1_NOAA-14_20000101_c20170623095628.nc',
's3://noaa-cdr-ndvi-pds/data/2000/AVHRR-Land_v005_AVH13C1_NOAA-14_20000102_c20170623101557.nc',
's3://noaa-cdr-ndvi-pds/data/2000/AVHRR-Land_v005_AVH13C1_NOAA-14_20000103_c20170623103338.nc',
's3://noaa-cdr-ndvi-pds/data/2000/AVHRR-Land_v005_AVH13C1_NOAA-14_20000104_c20170623105028.nc',
's3://noaa-cdr-ndvi-pds/data/2000/AVHRR-Land_v005_AVH13C1_NOAA-14_20000105_c20170623110559.nc']
Helper: remove problematic _FillValue attrs
The NDVI source files can carry _FillValue: NaN on coordinate metadata such as latitude. That is not valid strict JSON metadata, and it can also confuse the xarray/Zarr v3 read path.
For this teaching demo, we keep this simple: remove attr-level _FillValue entries before writing refs and before writing Icechunk. The actual virtual chunk references are unchanged.
def clean_attrs(ds):
# Drop attr-level _FillValue and any NaN attrs so refs/Icechunk metadata are strict JSON.
ds = ds.copy(deep=False)
def clean_one(attrs):
out = {}
for key, value in dict(attrs).items():
if key == "_FillValue":
continue
if isinstance(value, np.generic):
value = value.item()
if isinstance(value, float) and math.isnan(value):
continue
out[key] = value
return out
ds.attrs = clean_one(ds.attrs)
for name in ds.variables:
ds[name].attrs = clean_one(ds[name].attrs)
return ds1. Open one NetCDF virtually
This checks that VirtualiZarr can read the NetCDF metadata. This does not download the NDVI data chunks.
vds = open_virtual_dataset(
url=urls[0],
parser=parser,
registry=registry,
loadable_variables=["time", "latitude", "longitude"],
decode_times=True,
).drop_vars(["nv", "ncrs", "crs"], errors="ignore")
vds = clean_attrs(vds)
vds<xarray.Dataset> Size: 156MB
Dimensions: (latitude: 3600, longitude: 7200, time: 1, nv: 2)
Coordinates:
* latitude (latitude) float32 14kB 89.97 89.93 89.88 ... -89.93 -89.97
* longitude (longitude) float32 29kB -180.0 -179.9 -179.9 ... 179.9 180.0
* time (time) datetime64[ns] 8B 2000-01-01
Dimensions without coordinates: nv
Data variables:
lat_bnds (latitude, nv) float32 29kB ManifestArray<shape=(3600, 2), dty...
lon_bnds (longitude, nv) float32 58kB ManifestArray<shape=(7200, 2), dt...
NDVI (time, latitude, longitude) int16 52MB ManifestArray<shape=(1,...
TIMEOFDAY (time, latitude, longitude) int16 52MB ManifestArray<shape=(1,...
QA (time, latitude, longitude) int16 52MB ManifestArray<shape=(1,...
Attributes: (12/48)
title: Normalized Difference Vegetation ...
institution: NASA/GSFC/SED/ESD/HBSL/TIS/MODIS-...
Conventions: CF-1.6, ACDD-1.3
standard_name_vocabulary: CF Standard Name Table (v25, 05 J...
naming_authority: gov.noaa.ncei
license: See the Use Agreement for this CD...
... ...
PercentValidDaytimeData: 32.03
PercentValidDaytimeLand: 32.03
PercentValidClearDaytimeLand: 3.94
PercentValidDaytimeLandInCloudShadow: 1.19
PercentValidClearDaytimeWater: 0.00
PercentValidDaytimeWaterInCloudShadow: 0.00# NDVI is virtual: the data are references back to the original NetCDF file.
vds["NDVI"].data2. Write one Kerchunk reference artifact per NetCDF
This is intentionally serial and simple for teaching. In a real batch run, each machine could run this same loop for its assigned files.
With REF_FORMAT = "json", each output is one .json file. With REF_FORMAT = "parquet", each output is a .parquet directory.
# Start clean for the demo so stale refs do not cause confusing errors.
shutil.rmtree(ref_dir, ignore_errors=True)
ref_dir.mkdir(exist_ok=True)
ref_files = []
for url in urls:
out = ref_dir / Path(url).name.replace(".nc", REF_SUFFIX)
vds = open_virtual_dataset(
url=url,
parser=parser,
registry=registry,
loadable_variables=["time", "latitude", "longitude"],
decode_times=True,
).drop_vars(["nv", "ncrs", "crs"], errors="ignore")
vds = clean_attrs(vds)
vds.vz.to_kerchunk(
filepath=str(out),
format=REF_FORMAT,
)
ref_files.append(out)
print("wrote", out)
ref_files = sorted(ref_files)
ref_fileswrote ndvi_refs_json/AVHRR-Land_v005_AVH13C1_NOAA-14_20000101_c20170623095628.json
wrote ndvi_refs_json/AVHRR-Land_v005_AVH13C1_NOAA-14_20000102_c20170623101557.json
wrote ndvi_refs_json/AVHRR-Land_v005_AVH13C1_NOAA-14_20000103_c20170623103338.json
wrote ndvi_refs_json/AVHRR-Land_v005_AVH13C1_NOAA-14_20000104_c20170623105028.json
wrote ndvi_refs_json/AVHRR-Land_v005_AVH13C1_NOAA-14_20000105_c20170623110559.json
[PosixPath('ndvi_refs_json/AVHRR-Land_v005_AVH13C1_NOAA-14_20000101_c20170623095628.json'),
PosixPath('ndvi_refs_json/AVHRR-Land_v005_AVH13C1_NOAA-14_20000102_c20170623101557.json'),
PosixPath('ndvi_refs_json/AVHRR-Land_v005_AVH13C1_NOAA-14_20000103_c20170623103338.json'),
PosixPath('ndvi_refs_json/AVHRR-Land_v005_AVH13C1_NOAA-14_20000104_c20170623105028.json'),
PosixPath('ndvi_refs_json/AVHRR-Land_v005_AVH13C1_NOAA-14_20000105_c20170623110559.json')]
3. Reopen the reference artifacts and concatenate
Important: we reopen the Kerchunk refs with the parser directly instead of open_virtual_dataset(...). That avoids an xarray/Zarr v3 metadata decoding path that can fail on these files.
The files have a scalar CRS coordinate in the original NetCDF metadata, so we do not use coordinate-based combining. We explicitly concatenate in filename order along time.
ref_files = sorted(ref_dir.glob(f"*{REF_SUFFIX}"))
print(len(ref_files), "reference artifacts")
ref_files[:2]5 reference artifacts
[PosixPath('ndvi_refs_json/AVHRR-Land_v005_AVH13C1_NOAA-14_20000101_c20170623095628.json'),
PosixPath('ndvi_refs_json/AVHRR-Land_v005_AVH13C1_NOAA-14_20000102_c20170623101557.json')]
ref_root = f"file://{ref_dir.resolve()}"
ref_store = from_url(ref_root)
merge_registry = ObjectStoreRegistry({
bucket: store,
ref_root: ref_store,
})
ref_parser = KerchunkJSONParser() if REF_FORMAT == "json" else KerchunkParquetParser()
vds_list = []
for ref_file in ref_files:
manifest_store = ref_parser(
url=f"{ref_root}/{ref_file.name}",
registry=merge_registry,
)
# Fully virtual dataset; avoids open_virtual_dataset(...), which internally calls xr.open_zarr(...).
ds_one = manifest_store._group.to_virtual_dataset()
ds_one = ds_one.drop_vars(["nv", "ncrs", "crs"], errors="ignore")
ds_one = clean_attrs(ds_one)
vds_list.append(ds_one)
combined_vds = xr.concat(
vds_list,
dim="time",
data_vars="minimal",
coords="minimal",
combine_attrs="drop_conflicts",
compat="override",
).drop_vars(["nv", "ncrs", "crs"], errors="ignore")
combined_vds = clean_attrs(combined_vds)
combined_vds<xarray.Dataset> Size: 778MB
Dimensions: (latitude: 3600, nv: 2, longitude: 7200, time: 5)
Coordinates:
latitude (latitude) float32 14kB ManifestArray<shape=(3600,), dtype=flo...
longitude (longitude) float32 29kB ManifestArray<shape=(7200,), dtype=fl...
time (time) float32 20B ManifestArray<shape=(5,), dtype=float32, ch...
Dimensions without coordinates: nv
Data variables:
lat_bnds (latitude, nv) float32 29kB ManifestArray<shape=(3600, 2), dty...
lon_bnds (longitude, nv) float32 58kB ManifestArray<shape=(7200, 2), dt...
NDVI (time, latitude, longitude) int16 259MB ManifestArray<shape=(5...
TIMEOFDAY (time, latitude, longitude) int16 259MB ManifestArray<shape=(5...
QA (time, latitude, longitude) int16 259MB ManifestArray<shape=(5...
Attributes: (12/32)
institution: NASA/GSFC/SED/ESD/HBSL/TIS/MODIS-...
Conventions: CF-1.6, ACDD-1.3
standard_name_vocabulary: CF Standard Name Table (v25, 05 J...
naming_authority: gov.noaa.ncei
license: See the Use Agreement for this CD...
cdm_data_type: Grid
... ...
InputDataType: GAC
ESDT: AVH13C1
RangeBeginningTime: 00:00:00.0000
RangeEndingTime: 23:59:59.9999
PercentValidClearDaytimeWater: 0.00
PercentValidDaytimeWaterInCloudShadow: 0.004. Write the combined virtual dataset to Icechunk
The final Icechunk repo contains virtual references to the original S3 NetCDF chunks. It does not copy the NDVI data.
repo_path = Path("ndvi-icechunk-kerchunk")
shutil.rmtree(repo_path, ignore_errors=True)
config = icechunk.RepositoryConfig.default()
config.set_virtual_chunk_container(
icechunk.VirtualChunkContainer(
url_prefix="s3://noaa-cdr-ndvi-pds/",
store=icechunk.s3_store(region="us-east-1", anonymous=True),
),
)
storage = icechunk.local_filesystem_storage(str(repo_path))
repo = icechunk.Repository.create(storage, config)
session = repo.writable_session("main")
combined_vds = clean_attrs(combined_vds)
combined_vds.vz.to_icechunk(session.store)
snapshot_id = session.commit("Add 5-day NDVI CDR virtual references")
print("Committed:", snapshot_id)2026-06-04T02:47:05.659155Z WARN icechunk_arrow_object_store: The LocalFileSystem storage is not safe for concurrent commits. If more than one thread/process will attempt to commit at the same time, prefer using object stores. at icechunk-arrow-object-store/src/lib.rs:196 Committed: 48090R4J16894CSXRW1G
5. Read back the Icechunk store
Authorize Icechunk to read the original public S3 chunks, then open the Icechunk store with xarray.
credentials = icechunk.containers_credentials({
"s3://noaa-cdr-ndvi-pds/": icechunk.s3_credentials(anonymous=True)
})
repo2 = icechunk.Repository.open(
storage,
config,
authorize_virtual_chunk_access=credentials,
)
session2 = repo2.readonly_session("main")
ds = xr.open_zarr(
session2.store,
consolidated=False,
chunks=None,
)
ds<xarray.Dataset> Size: 2GB
Dimensions: (time: 5, latitude: 3600, longitude: 7200, nv: 2)
Coordinates:
* time (time) datetime64[ns] 40B 2000-01-01 2000-01-02 ... 2000-01-05
* latitude (latitude) float32 14kB 89.97 89.93 89.88 ... -89.93 -89.97
* longitude (longitude) float32 29kB -180.0 -179.9 -179.9 ... 179.9 180.0
Dimensions without coordinates: nv
Data variables:
NDVI (time, latitude, longitude) float64 1GB ...
QA (time, latitude, longitude) int16 259MB ...
TIMEOFDAY (time, latitude, longitude) float64 1GB ...
lat_bnds (latitude, nv) float32 29kB ...
lon_bnds (longitude, nv) float32 58kB ...
Attributes: (12/32)
institution: NASA/GSFC/SED/ESD/HBSL/TIS/MODIS-...
Conventions: CF-1.6, ACDD-1.3
standard_name_vocabulary: CF Standard Name Table (v25, 05 J...
naming_authority: gov.noaa.ncei
license: See the Use Agreement for this CD...
cdm_data_type: Grid
... ...
InputDataType: GAC
ESDT: AVH13C1
RangeBeginningTime: 00:00:00.0000
RangeEndingTime: 23:59:59.9999
PercentValidClearDaytimeWater: 0.00
PercentValidDaytimeWaterInCloudShadow: 0.00# Optional quick check: read one small-ish slice from the virtual dataset.
ds["NDVI"].isel(time=0, latitude=slice(0, 10), longitude=slice(0, 10)).valuesOptional plot
This reads data lazily from the original S3 NetCDF files through Icechunk.
import hvplot.xarray # noqa
ds["NDVI"].isel(time=0).hvplot(
rasterize=True,
geo=True,
global_extent=True,
x="longitude",
y="latitude",
tiles="OSM",
cmap="YlGn",
clim=(-0.1, 1.0),
title="AVHRR NDVI — 2000-01-01",
width=800,
height=400,
)