Loading...
Loading...
Use when writing or modifying Python code that imports `genoray` to read genotypes/dosages from VCF, PGEN, or SparseVar (`.svar`) files. Covers the public API surface, mode constants, range queries, chunking, filtering, and the SparseVar workflow. Skip for unrelated bioinformatics work.
npx skill4agent add d-laub/genoray genoray-apigenoraySparseVar.svarimport genoraygenoray.VCFgenoray.PGENgenoray.SparseVar.svargenoray.ReaderVCF | PGEN | SparseVargenoray.exprs.gvi_genoray._vcfdocs/source/index.mddocs/source/svar.mdgenoray/__init__.pygenoray/_vcf.pyVCFreadchunkgenoray/_pgen.pyPGENreadchunkread_rangeschunk_rangesgenoray/_svar.pySparseVar__init__from_vcffrom_pgenread_rangeswith_fieldsgenoray/exprs.pyis_snpis_indelis_biallelicILEN[start, end)max_mem"4g""512m""2GB""chr1""1"ContigNormalizer-1np.nanmodegenoray.VCF.Genos8 # not genoray.Genos8
genoray.PGEN.GenosPhasingDosages
`_vcf.py` `_pgen.py` `Genos` — `PGEN.GenosPhasingDosages`: `(genos, phasing, dosages)`; `VCF.Genos8Dosages`: `(genos, dosages)`
vcf = genoray.VCF(
"file.vcf.gz",
phasing=True, # constructor-time, not per-read
dosage_field="DS", # required to read dosages; FORMAT field with Number=A
filter=lambda v: ..., # cyvcf2.Variant -> bool
)
# Single range
arr = vcf.read("chr1", start=0, end=1_000_000, mode=genoray.VCF.Genos8)
# Chunked
for chunk in vcf.chunk("chr1", start=0, end=1_000_000,
max_mem="2g", mode=genoray.VCF.Genos8Dosages):
    ...
`phasing=False`: `(samples, ploidy=2, variants)`; `phasing=True`: `(samples, ploidy+1=3, variants)`; 01 `(samples, variants)` `float32` `read_ranges`
pgen = genoray.PGEN(
"hardcalls.pgen", # hardcalls live in the main path
dosage_path="dosages.pgen", # optional; defaults to the main path
filter=genoray.exprs.is_snp & genoray.exprs.is_biallelic,
)
dosage_path.gvi
# Single range
genos = pgen.read("chr2", start=0, end=1000)
# Multiple ranges in one call (PGEN-only optimization)
data, offsets = pgen.read_ranges(
"chr2",
starts=[0, 1000, 2000],
ends=[1000, 2000, 3000],
mode=genoray.PGEN.GenosPhasingDosages,
)
# `data` matches the mode (tuple when mode bundles multiple arrays)
# `offsets` shape: (n_ranges + 1,). Slice range i with: arr[..., offsets[i]:offsets[i+1]]
# Chunked variants of both
for chunk in pgen.chunk("chr2", 0, 1000, max_mem="4g"): ...
for range_iter in pgen.chunk_ranges("chr2", starts, ends, max_mem="4g"):
    for chunk in range_iter: ...
`int32` `float32` `bool` `(samples, variants)` `phasing=True` `.svar`
# From a configured VCF reader
vcf = genoray.VCF("file.vcf.gz", dosage_field="DS")
genoray.SparseVar.from_vcf("out.svar", vcf, max_mem="4g",
with_dosages=True, overwrite=True)
# Or from a PGEN
genoray.SparseVar.from_pgen("out.svar", "file.pgen", max_mem="4g")
# Plain ragged: data is just variant indices
svar = genoray.SparseVar("out.svar")
ragged = svar.read_ranges("chr1", starts=[0, 50_000], ends=[10_000, 60_000],
samples=["S1", "S2"])
# shape: (ranges, samples, ploidy, ~variants) — last axis is ragged
# With extra fields attached
svar = genoray.SparseVar("out.svar", fields={"dosages": np.float32})
# or, on an existing instance:
svar_with = svar.with_fields({"dosages": np.float32})
result = svar_with.read_ranges("chr1", [0], [10_000])
result.genos # Ragged of variant indices (uint32)
result.dosages # Ragged of dosages (float32)
`with_fields(False)` `Ragged[V_IDX_TYPE]` `svar.index` `DataFrame`: CHROM, POS, REF, ALT (list[str]), ILEN
v_idxs = ragged[0, 0, 0].to_numpy()
rows = svar.index[v_idxs.tolist()].select("CHROM", "POS", "REF", "ALT")
`svar.index.POS` `Callable[[cyvcf2.Variant], bool]` `filter=` `pl.Expr` `.gvi` `genoray.exprs` `is_snp` `is_indel` `is_biallelic` `ILEN` `pl.col(...)` `.gvi` `genoray/exprs.py` `exprs` `&` `|` `import polars as pl`

| Mistake | Fix |
|---|---|
| |
| Set |
| Reading dosages from a VCF without `dosage_field` | Pass `dosage_field` (e.g. `dosage_field="DS"`) |
| Putting a dosage-only PGEN in the main path when you also have hardcalls | Hardcalls in main path, dosages in `dosage_path` |
Importing | Use |
| Expecting VCF to have `read_ranges` | VCF doesn't; loop over single-range `read` |
| Treating `svar.index.POS` as 0-based | It's 1-based; subtract 1 to compare with query coords |
Calling | PGEN returns |
import genorayCLAUDE.md