Files
2026-05-31 17:28:33 +02:00

338 lines
6.5 KiB
Nix

# Python package expression for Daft: a maturin-built (Rust + Python) wheel
# compiled from the GitHub release tag and checked with pytest.
{
  # builders, fetchers and the native toolchain
  lib,
  buildPythonPackage,
  fetchFromGitHub,
  fetchgit,
  pythonOlder,
  rustPlatform,
  stdenv,
  cargo,
  nasm,
  pkg-config,
  rustc,

  # runtime dependencies
  fsspec,
  packaging,
  pyarrow,
  tqdm,
  typing-extensions,

  # packages backing the optional extras
  av,
  boto3,
  clickhouse-connect,
  datasets,
  deltalake,
  google-genai,
  httpx,
  huggingface-hub,
  librosa,
  mypy-boto3-glue,
  numpy,
  openai,
  pandas,
  pgvector,
  pillow,
  psycopg,
  pyiceberg,
  ray,
  requests,
  sentence-transformers,
  soundfile,
  sqlalchemy,
  sqlglot,
  torch,
  torchvision,
  transformers,

  # inputs needed only by the test suite
  adlfs,
  cloudpickle,
  dask,
  databricks-sdk,
  duckdb,
  gcsfs,
  google-cloud-bigtable,
  hypothesis,
  jax,
  jaxtyping,
  lxml,
  memray,
  moto,
  opencv-python,
  pydantic,
  pymysql,
  pyodbc,
  pytest-timeout,
  pytest-xdist,
  pytestCheckHook,
  reportlab,
  s3fs,
  tenacity,
  tiktoken,
  writableTmpDirAsHomeHook,
  xxhash,
}:

let
  # The one git source whose checkout gets patched after fetching.
  azureSdkUrl = "https://github.com/Eventual-Inc/azure-sdk-for-rust";

  # Drop-in replacement for the `fetchgit` used by `importCargoLock`.
  # Every request is forwarded to the real `fetchgit`; only for the Azure SDK
  # fork an extra `postFetch` step is appended that inserts "svc/blobstorage"
  # right after the "mgmt/batch" entry of services/Cargo.toml.
  fetchgitWithAzureFix =
    args:
    fetchgit (
      args
      // lib.optionalAttrs ((args.url or null) == azureSdkUrl) {
        postFetch = (args.postFetch or "") + ''
          substituteInPlace $out/services/Cargo.toml \
          --replace-fail '"mgmt/batch",' '"mgmt/batch", "svc/blobstorage",'
        '';
      }
    );
in
buildPythonPackage (finalAttrs: {
  pname = "daft";
  version = "0.7.14";
  pyproject = true;
  __structuredAttrs = true;

  # Not built for Python interpreters older than 3.10.
  disabled = pythonOlder "3.10";

  src = fetchFromGitHub {
    owner = "Eventual-Inc";
    repo = "Daft";
    tag = "v${finalAttrs.version}";
    hash = "sha256-qw1NB+RvXOFMHZvqpD5CSLWSUUKmtfWr0EyBgRMv2lA=";
  };

  # Vendor the crates from the Cargo.lock kept next to this file, routing git
  # sources through the patched fetcher defined above.
  cargoDeps = (rustPlatform.importCargoLock.override { fetchgit = fetchgitWithAzureFix; }) {
    lockFile = ./Cargo.lock;
    outputHashes."azure_core-0.21.0" = "sha256-I8kzIkguRa3REwii0xsFFpNhE90/QX5msXwE6rrzDlY=";
  };

  # Source fix-ups: stamp the package version into Cargo.toml and apply two
  # compatibility edits (explained by the comments inside the script).
  postPatch = ''
    substituteInPlace Cargo.toml \
    --replace-fail 'version = "0.3.0-dev0"' 'version = "${finalAttrs.version}"'
    # ArrayChunks::into_remainder() returns IntoIter<_, N> on current stable
    # rustc but Option<IntoIter<_, N>> on the nightly upstream pins.
    substituteInPlace src/daft-minhash/src/lib.rs \
    --replace-fail 'chunks.into_remainder()' 'Some(chunks.into_remainder())'
    # `hash_map_macro` was renamed/removed in newer rustc; the gate is declared
    # but never used in the crate.
    substituteInPlace src/daft-distributed/src/lib.rs \
    --replace-fail '#![feature(hash_map_macro)]' ""
  '';

  # nasm is appended only when the host platform is x86_64.
  nativeBuildInputs = [
    cargo
    pkg-config
    rustc
    rustPlatform.bindgenHook
    rustPlatform.cargoSetupHook
    rustPlatform.maturinBuildHook
  ]
  ++ lib.optionals stdenv.hostPlatform.isx86_64 [ nasm ];

  # NOTE(review): cmake is not among the inputs listed in this file; confirm
  # whether this switch is still needed.
  dontUseCmakeConfigure = true;

  env = {
    # These two keep the build from running the npm frontend build.
    CI = "1";
    DAFT_DASHBOARD_SKIP_BUILD = "1";

    DAFT_RUNNER = "native";
    NIX_CFLAGS_COMPILE = "-Wno-error";

    # daft-minhash enables #![feature(portable_simd)], a nightly-only feature.
    RUSTC_BOOTSTRAP = "1";
  };

  pythonRelaxDeps = [ "fsspec" ];

  dependencies = [
    fsspec
    packaging
    pyarrow
    tqdm
    # Upstream's pyproject.toml restricts this to python_version < '3.11',
    # yet daft/series.py imports `Self` from typing_extensions unconditionally.
    typing-extensions
  ];

  # One entry per upstream extra; empty lists are extras with no packages
  # listed here.
  optional-dependencies = {
    audio = [
      librosa
      soundfile
    ];
    aws = [
      boto3
      mypy-boto3-glue
    ];
    azure = [ ];
    clickhouse = [ clickhouse-connect ];
    deltalake = [ deltalake ];
    gcp = [ ];
    google = [
      google-genai
      numpy
      pillow
    ];
    gravitino = [ requests ];
    hudi = [ pyarrow ];
    huggingface = [
      datasets
      huggingface-hub
    ];
    iceberg = [ pyiceberg ];
    numpy = [ numpy ];
    openai = [
      numpy
      openai
      pillow
    ];
    pandas = [ pandas ];
    postgres = [
      pgvector
      psycopg
      sqlglot
    ];
    ray = [ ray ];
    sql = [
      sqlalchemy
      sqlglot
    ];
    transformers = [
      pillow
      sentence-transformers
      torch
      torchvision
      transformers
    ];
    unity = [
      deltalake
      httpx
    ];
    video = [ av ];
    viz = [ ];
  };

  nativeCheckInputs = [
    adlfs
    av
    boto3
    clickhouse-connect
    cloudpickle
    dask
    databricks-sdk
    datasets
    deltalake
    duckdb
    gcsfs
    google-cloud-bigtable
    google-genai
    huggingface-hub
    hypothesis
    jax
    jaxtyping
    librosa
    lxml
    memray
    moto
    numpy
    opencv-python
    openai
    pandas
    pgvector
    pillow
    pydantic
    pyiceberg
    pymysql
    pyodbc
    pytest-timeout
    pytest-xdist
    pytestCheckHook
    ray
    reportlab
    s3fs
    sentence-transformers
    soundfile
    sqlalchemy
    sqlglot
    tenacity
    tiktoken
    torch
    torchvision
    transformers
    writableTmpDirAsHomeHook
    xxhash
  ];

  # Delete the `daft` directory from the unpacked source before testing;
  # presumably so the tests import the installed package instead - confirm.
  preCheck = ''
    rm -rf daft
  '';

  disabledTestPaths = [
    "tests/integration"
    "tests/benchmarks"
    "tests/microbenchmarks"

    # need daft-lance, mcap, turbopuffer, unitycatalog or connectorx
    "tests/io/lancedb"
    "tests/io/mcap"
    "tests/io/test_turbopuffer_write.py"
    "tests/io/test_roundtrip_embeddings.py"
    "tests/io/test_sql.py"
    "tests/checkpoint/test_native_runner_gate.py"
    "tests/dataframe/test_explain.py"
    "tests/dataframe/test_limit_offset.py"
    "tests/catalog/test_catalog.py"
    "tests/catalog/test_unity_auth.py"
    "tests/catalog/test_unity_oauth2.py"

    # failing upstream
    "tests/io/test_s3_credentials_refresh.py"

    # need network access
    "tests/ai"
    "tests/sql/test_uri_exprs.py"
    "tests/recordbatch/test_tokenize.py"

    # launches a Python subprocess that does not have daft on its path
    "tests/test_context.py"

    # the ray runtime runs out of memory
    "tests/ray"
  ];

  disabledTests = [
    # writing through relative paths fails inside the sandbox
    "test_append_and_overwrite_local_relative_path"

    # needs network access
    "test_to_tempfile_remote"

    # failing upstream
    "test_table_concat_schema_mismatch"
  ];

  pythonImportsCheck = [ "daft" ];

  meta = {
    description = "Distributed dataframes for multimodal data";
    homepage = "https://github.com/Eventual-Inc/Daft";
    changelog = "https://github.com/Eventual-Inc/Daft/releases/tag/${finalAttrs.src.tag}";
    license = lib.licenses.asl20;
    maintainers = [ lib.maintainers.derdennisop ];
    mainProgram = "daft";
    platforms = lib.platforms.unix;
  };
})