# Colab users, uncomment and run this
#!pip install -q icechunk virtualizarr xarray obspec_utils obstore hvplot
VirtualiZarr → Icechunk → Append days
Workflow
- Open a single nc file on S3 and create a virtual dataset
- Write virtual references to an Icechunk store
- Open the next nc file on S3 and create a virtual dataset
- Append to the Icechunk store
- Repeat
import warnings
import shutil
from pathlib import Path
import xarray as xr
import icechunk
from obstore.store import from_url
from virtualizarr import open_virtual_dataset, open_virtual_mfdataset
from virtualizarr.parsers import HDFParser
from obspec_utils.registry import ObjectStoreRegistry
warnings.filterwarnings(
"ignore",
message="Numcodecs codecs are not in the Zarr version 3 specification*",
category=UserWarning,
)
Set up the URLs to the object storage
# NOAA NDVI bucket and the key prefix under which the year-2000 files live.
bucket = "s3://noaa-cdr-ndvi-pds"
base = "data/2000"

# Five consecutive daily files (1-5 January 2000; the date is part of each name).
filenames = [
    "AVHRR-Land_v005_AVH13C1_NOAA-14_20000101_c20170623095628.nc",
    "AVHRR-Land_v005_AVH13C1_NOAA-14_20000102_c20170623101557.nc",
    "AVHRR-Land_v005_AVH13C1_NOAA-14_20000103_c20170623103338.nc",
    "AVHRR-Land_v005_AVH13C1_NOAA-14_20000104_c20170623105028.nc",
    "AVHRR-Land_v005_AVH13C1_NOAA-14_20000105_c20170623110559.nc",
]

# Full object URL for every file: <bucket>/<base>/<filename>.
urls = ["/".join((bucket, base, name)) for name in filenames]
urls[0]'s3://noaa-cdr-ndvi-pds/data/2000/AVHRR-Land_v005_AVH13C1_NOAA-14_20000101_c20170623095628.nc'
import xarray as xr
# Open the first daily file directly with xarray (h5netcdf engine), reading
# anonymously from S3 in us-east-1.
url = urls[0]
s3_options = {
    "anon": True,
    "client_kwargs": {"region_name": "us-east-1"},
}
ds = xr.open_dataset(
    url,
    engine="h5netcdf",
    backend_kwargs={"storage_options": s3_options},
)
ds<xarray.Dataset> Size: 467MB
Dimensions: (ncrs: 1, latitude: 3600, nv: 2, longitude: 7200, time: 1)
Coordinates:
* latitude (latitude) float32 14kB 89.97 89.93 89.88 ... -89.93 -89.97
* longitude (longitude) float32 29kB -180.0 -179.9 -179.9 ... 179.9 180.0
* time (time) datetime64[ns] 8B 2000-01-01
Dimensions without coordinates: ncrs, nv
Data variables:
crs (ncrs) int16 2B ...
lat_bnds (latitude, nv) float32 29kB ...
lon_bnds (longitude, nv) float32 58kB ...
NDVI (time, latitude, longitude) float64 207MB ...
TIMEOFDAY (time, latitude, longitude) datetime64[ns] 207MB ...
QA (time, latitude, longitude) int16 52MB ...
Attributes: (12/48)
title: Normalized Difference Vegetation ...
institution: NASA/GSFC/SED/ESD/HBSL/TIS/MODIS-...
Conventions: CF-1.6, ACDD-1.3
standard_name_vocabulary: CF Standard Name Table (v25, 05 J...
naming_authority: gov.noaa.ncei
license: See the Use Agreement for this CD...
... ...
PercentValidDaytimeData: 32.03
PercentValidDaytimeLand: 32.03
PercentValidClearDaytimeLand: 3.94
PercentValidDaytimeLandInCloudShadow: 1.19
PercentValidClearDaytimeWater: 0.00
PercentValidDaytimeWaterInCloudShadow: 0.00
Setup: S3 store and registry
Point obstore at the public NDVI bucket and register it so VirtualiZarr can resolve chunk references.
# Object-store handle for the bucket that holds the remote source files.
# The bucket is read anonymously: the region is us-east-1 and requests are
# left unsigned (skip_signature=True).
store = from_url(
    bucket,
    region="us-east-1",
    skip_signature=True,
)

# Map the bucket URL to that store so VirtualiZarr knows which object store
# to use for any URL that starts with this bucket.
registry = ObjectStoreRegistry({bucket: store})

# Pick the parser that matches the format of the files being virtualized.
# These NOAA NDVI files are NetCDF4/HDF5-style, hence HDFParser; a different
# source format would need a different parser.
parser = HDFParser()
Common VirtualiZarr parsers
| Parser | Source format | Example use |
|---|---|---|
| `HDFParser` | NetCDF4 / HDF5 files | Many .nc files from NASA/NOAA archives |
| `NetCDF3Parser` | Classic NetCDF-3 files | Older .nc files that are not HDF5 internally |
| `DMRPPParser` | DMR++ metadata files | NASA Earthdata / OPeNDAP-style cloud metadata sidecars |
| `ZarrParser` | Existing Zarr stores | Virtualizing or re-referencing an existing .zarr store |
| `IcechunkParser` | Existing Icechunk repositories | Opening an existing Icechunk repo as a virtual source |
| `KerchunkJSONParser` | Existing Kerchunk JSON reference files | Reopening refs written with to_kerchunk(format="json") |
| `KerchunkParquetParser` | Existing Kerchunk Parquet reference stores | Reopening refs written with to_kerchunk(format="parquet") |
| `FITSParser` | FITS files | Astronomy data |
| `VirtualTIFF` | TIFF / GeoTIFF-style files | Comes from the virtual_tiff package, not the main VirtualiZarr parser import list |
1. Open a single virtual dataset
open_virtual_dataset() reads the metadata from the source file and builds an xarray-like dataset whose large data variables are still virtual. In other words, the big NDVI arrays are not downloaded. VirtualiZarr records where those chunks live in the original S3 NetCDF file.
Some variables, however, are useful to load as real in-memory arrays. These are usually small coordinate variables that describe the grid or the file’s position in time. In this dataset, we load:
- time: needed so xarray/Icechunk knows where this file belongs along the time axis
- latitude: the 1D latitude coordinate for the grid
- longitude: the 1D longitude coordinate for the grid
A good rule of thumb is:
Load small coordinate variables; keep large science variables virtual.
# Virtualize the first file. Only the coordinate variables named here are
# passed as loadable_variables; time decoding is switched on.
loadable = ["time", "latitude", "longitude"]
vds = open_virtual_dataset(
    url=urls[0],
    parser=parser,
    registry=registry,
    loadable_variables=loadable,
    decode_times=True,
)
vds<xarray.Dataset> Size: 156MB
Dimensions: (latitude: 3600, longitude: 7200, time: 1, ncrs: 1, nv: 2)
Coordinates:
* latitude (latitude) float32 14kB 89.97 89.93 89.88 ... -89.93 -89.97
* longitude (longitude) float32 29kB -180.0 -179.9 -179.9 ... 179.9 180.0
* time (time) datetime64[ns] 8B 2000-01-01
ncrs (ncrs) float32 4B ManifestArray<shape=(1,), dtype=float32, chu...
nv (nv) float32 8B ManifestArray<shape=(2,), dtype=float32, chunk...
Data variables:
crs (ncrs) int16 2B ManifestArray<shape=(1,), dtype=int16, chunks=...
lat_bnds (latitude, nv) float32 29kB ManifestArray<shape=(3600, 2), dty...
lon_bnds (longitude, nv) float32 58kB ManifestArray<shape=(7200, 2), dt...
NDVI (time, latitude, longitude) int16 52MB ManifestArray<shape=(1,...
TIMEOFDAY (time, latitude, longitude) int16 52MB ManifestArray<shape=(1,...
QA (time, latitude, longitude) int16 52MB ManifestArray<shape=(1,...
Attributes: (12/48)
title: Normalized Difference Vegetation ...
institution: NASA/GSFC/SED/ESD/HBSL/TIS/MODIS-...
Conventions: CF-1.6, ACDD-1.3
standard_name_vocabulary: CF Standard Name Table (v25, 05 J...
naming_authority: gov.noaa.ncei
license: See the Use Agreement for this CD...
... ...
PercentValidDaytimeData: 32.03
PercentValidDaytimeLand: 32.03
PercentValidClearDaytimeLand: 3.94
PercentValidDaytimeLandInCloudShadow: 1.19
PercentValidClearDaytimeWater: 0.00
PercentValidDaytimeWaterInCloudShadow: 0.00
2. Write to an Icechunk store
# Start clean: remove any repository left at this path by an earlier run.
repo_path = Path("ndvi-icechunk-append")
if repo_path.exists():
    shutil.rmtree(repo_path)
    print(f"Cleared existing repo at {repo_path}/")

# Register the source bucket as a virtual chunk container on the default
# repository configuration (anonymous S3 access, us-east-1).
ndvi_container = icechunk.VirtualChunkContainer(
    url_prefix="s3://noaa-cdr-ndvi-pds/",
    store=icechunk.s3_store(region="us-east-1", anonymous=True),
)
config = icechunk.RepositoryConfig.default()
config.set_virtual_chunk_container(ndvi_container)

# Create the repository on the local filesystem and open a writable session
# on the "main" branch.
storage = icechunk.local_filesystem_storage(str(repo_path))
repo = icechunk.Repository.create(storage, config)
session = repo.writable_session("main")

# Write the first day's virtual dataset into the session store and commit it.
vds.vz.to_icechunk(session.store)
snapshot_id = session.commit("1 days NDVI CDR Jan 2000")
print("Committed:", snapshot_id)Cleared existing repo at test-icechunk/ 2026-06-04T03:30:33.717440Z WARN icechunk_arrow_object_store: The LocalFileSystem storage is not safe for concurrent commits. If more than one thread/process will attempt to commit at the same time, prefer using object stores. at icechunk-arrow-object-store/src/lib.rs:196 Committed: F91FSASSNDE47GM4547G
3. Append the next NetCDF file to the Icechunk store
# Virtualize the second file (urls[1]) with the same arguments used for the
# first one.
vds2 = open_virtual_dataset(
    url=urls[1],
    parser=parser,
    registry=registry,
    loadable_variables=["time", "latitude", "longitude"],
    decode_times=True,
)
# Drop these variables before appending; errors="ignore" means names that
# are not present do not raise.
vds2 = vds2.drop_vars(["nv", "ncrs", "crs"], errors="ignore")

# The repository already exists and holds the first day, so reopen it and
# append the second day along the "time" dimension, then commit.
repo = icechunk.Repository.open(storage)
session = repo.writable_session("main")
vds2.vz.to_icechunk(session.store, append_dim="time")
snapshot_id = session.commit("Add day 2 NDVI CDR Jan 2000")
print("Committed:", snapshot_id)Committed: P3HS05AS0H9V6JC51830
Let’s do all 5 files in a for loop
# Remove the repository on disk so the loop below starts from an empty store.
if repo_path.exists():
    shutil.rmtree(repo_path)
    print(f"Cleared existing repo at {repo_path}/")

# Rebuild the configuration: default settings plus a virtual chunk container
# for the source bucket (anonymous S3 access, us-east-1).
source_container = icechunk.VirtualChunkContainer(
    url_prefix="s3://noaa-cdr-ndvi-pds/",
    store=icechunk.s3_store(region="us-east-1", anonymous=True),
)
config = icechunk.RepositoryConfig.default()
config.set_virtual_chunk_container(source_container)

# Recreate the repository at the same local path.
storage = icechunk.local_filesystem_storage(str(repo_path))
repo = icechunk.Repository.create(storage, config)
session = repo.writable_session("main")Cleared existing repo at test-icechunk/ 2026-06-04T03:49:44.102549Z WARN icechunk_arrow_object_store: The LocalFileSystem storage is not safe for concurrent commits. If more than one thread/process will attempt to commit at the same time, prefer using object stores. at icechunk-arrow-object-store/src/lib.rs:196
# Virtualize each daily file in turn and write it to the open session,
# printing how long each file took. One commit is made after the loop.
import time
from pathlib import Path

for day_index, url in enumerate(urls[:5]):
    nc_name = Path(url).name
    t0 = time.perf_counter()
    print(f"Adding {nc_name}")

    vds = open_virtual_dataset(
        url=url,
        parser=parser,
        registry=registry,
        loadable_variables=["time", "latitude", "longitude"],
        decode_times=True,
    )
    vds = vds.drop_vars(["nv", "ncrs", "crs"], errors="ignore")

    # The first file is written plainly; every later file is appended
    # along the "time" dimension.
    write_kwargs = {"append_dim": "time"} if day_index else {}
    vds.vz.to_icechunk(session.store, **write_kwargs)

    seconds = time.perf_counter() - t0
    print(f"Finished {nc_name} in {seconds:.2f} seconds")

snapshot_id = session.commit("5 days NDVI CDR Jan 2000")
print("Committed:", snapshot_id)Adding AVHRR-Land_v005_AVH13C1_NOAA-14_20000101_c20170623095628.nc
Finished AVHRR-Land_v005_AVH13C1_NOAA-14_20000101_c20170623095628.nc in 2.03 seconds
Adding AVHRR-Land_v005_AVH13C1_NOAA-14_20000102_c20170623101557.nc
Finished AVHRR-Land_v005_AVH13C1_NOAA-14_20000102_c20170623101557.nc in 1.48 seconds
Adding AVHRR-Land_v005_AVH13C1_NOAA-14_20000103_c20170623103338.nc
Finished AVHRR-Land_v005_AVH13C1_NOAA-14_20000103_c20170623103338.nc in 1.49 seconds
Adding AVHRR-Land_v005_AVH13C1_NOAA-14_20000104_c20170623105028.nc
Finished AVHRR-Land_v005_AVH13C1_NOAA-14_20000104_c20170623105028.nc in 1.47 seconds
Adding AVHRR-Land_v005_AVH13C1_NOAA-14_20000105_c20170623110559.nc
Finished AVHRR-Land_v005_AVH13C1_NOAA-14_20000105_c20170623110559.nc in 1.45 seconds
Committed: EMHXA5DM6VQ40581GK3G
# At ~1.5 seconds per daily file, 10 years (365 * 10 files) takes about 1.5 hours
365*10*1.5/(60*60)1.5208333333333333
Read back and plot
Open the Icechunk store with xarray — all 5 days appear as a single continuous dataset. Chunk data is fetched lazily from S3 on demand.
from pathlib import Path
import icechunk
# Location of the Icechunk repository to read back.
# This has to be the directory the repository was written to earlier
# ("ndvi-icechunk-append"); any other path opens a different (or missing)
# repository instead of the one just built.
repo_path = Path("ndvi-icechunk-append")

# Tell Icechunk how to authenticate to the bucket holding the underlying
# NetCDF data. Anonymous credentials are used for this bucket.
credentials = icechunk.containers_credentials({
    "s3://noaa-cdr-ndvi-pds/": icechunk.s3_credentials(anonymous=True)
})

# Create the default Icechunk repository configuration.
# The configuration controls how the Icechunk repo stores metadata.
config = icechunk.RepositoryConfig.default()

# Tell Icechunk that some chunks in this dataset are not stored inside the
# Icechunk repository itself. Instead, they remain references to byte ranges
# in the original NetCDF files in this S3 bucket.
#
# The url_prefix must match the beginning of the source file URLs used by
# VirtualiZarr. store tells Icechunk how to access those original source
# files. anonymous=True is used because this NOAA bucket is public.
config.set_virtual_chunk_container(
    icechunk.VirtualChunkContainer(
        url_prefix="s3://noaa-cdr-ndvi-pds/",
        store=icechunk.s3_store(region="us-east-1", anonymous=True),
    ),
)

# Point Icechunk at the local directory where the repository was written.
# This is the same repo_path used earlier when creating the Icechunk repo.
storage = icechunk.local_filesystem_storage(str(repo_path))

# Open the existing Icechunk repository for reading.
#
# Because this Icechunk dataset contains virtual chunks that point back to
# the original NOAA S3 NetCDF files, we also provide credentials that
# authorize Icechunk to read from that external S3 location for THIS read
# session.
repo2 = icechunk.Repository.open(
    storage,
    config,
    authorize_virtual_chunk_access=credentials,
)

# Open a read-only session on the main branch.
session2 = repo2.readonly_session("main")2026-06-04T04:00:30.215005Z WARN icechunk_arrow_object_store: The LocalFileSystem storage is not safe for concurrent commits. If more than one thread/process will attempt to commit at the same time, prefer using object stores. at icechunk-arrow-object-store/src/lib.rs:196
Read the data
import xarray as xr

# Open the read-only session's store as an xarray Dataset.
# consolidated=False: do not read consolidated metadata.
# chunks=None: NOTE(review) per the xarray docs this skips dask and uses
# xarray's own lazy loading — confirm against the installed version.
ds = xr.open_zarr(
    session2.store,
    consolidated=False,
    chunks=None,
)
ds/srv/conda/envs/notebook/lib/python3.12/site-packages/zarr/codecs/numcodecs/_codecs.py:141: ZarrUserWarning: Numcodecs codecs are not in the Zarr version 3 specification and may not be supported by other zarr implementations.
super().__init__(**codec_config)
<xarray.Dataset> Size: 2GB
Dimensions: (time: 5, latitude: 3600, longitude: 7200, nv: 2)
Coordinates:
* time (time) datetime64[ns] 40B 2000-01-01 2000-01-02 ... 2000-01-05
* latitude (latitude) float32 14kB 89.97 89.93 89.88 ... -89.93 -89.97
* longitude (longitude) float32 29kB -180.0 -179.9 -179.9 ... 179.9 180.0
Dimensions without coordinates: nv
Data variables:
NDVI (time, latitude, longitude) float64 1GB ...
lat_bnds (latitude, nv) float32 29kB ...
QA (time, latitude, longitude) int16 259MB ...
lon_bnds (longitude, nv) float32 58kB ...
TIMEOFDAY (time, latitude, longitude) datetime64[ns] 1GB ...
Attributes: (12/48)
title: Normalized Difference Vegetation ...
institution: NASA/GSFC/SED/ESD/HBSL/TIS/MODIS-...
Conventions: CF-1.6, ACDD-1.3
standard_name_vocabulary: CF Standard Name Table (v25, 05 J...
naming_authority: gov.noaa.ncei
license: See the Use Agreement for this CD...
... ...
PercentValidDaytimeData: 32.01
PercentValidDaytimeLand: 32.01
PercentValidClearDaytimeLand: 3.14
PercentValidDaytimeLandInCloudShadow: 1.04
PercentValidClearDaytimeWater: 0.00
PercentValidDaytimeWaterInCloudShadow: 0.00
Make a plot
# Imported only so that the .hvplot calls below are available on xarray objects.
import hvplot.xarray  # noqa

ndvi = ds["NDVI"]

# Global NDVI map for the first day
ndvi.isel(time=0).hvplot(
    x="longitude",
    y="latitude",
    rasterize=True,
    geo=True,
    global_extent=True,
    tiles='OSM',
    cmap="YlGn",
    clim=(-0.1, 1.0),
    title="AVHRR NDVI — 2000-01-01",
    width=800,
    height=400,
)

# Global mean NDVI over the 5-day period
ndvi.mean(["latitude", "longitude"]).hvplot(
    title="Global mean NDVI — Jan 1–5, 2000",
    ylabel="NDVI",
)