# Package definition for lm-eval, built from the EleutherAI/lm-evaluation-harness
# GitHub repository with buildPythonPackage (pyproject-based build).
{
  lib,
  buildPythonPackage,
  fetchFromGitHub,

  # build-system
  setuptools,

  # dependencies (runtime, plus the packages backing the `hf` extra)
  accelerate,
  datasets,
  dill,
  evaluate,
  jinja2,
  jsonlines,
  more-itertools,
  peft,
  pytablewriter,
  rouge-score,
  sacrebleu,
  scikit-learn,
  sqlitedict,
  torch,
  transformers,
  typing-extensions,
  word2number,
  zstandard,

  # optional-dependencies
  aiohttp,
  immutabledict,
  langdetect,
  librosa,
  nltk,
  numpy,
  optimum,
  pandas,
  pymorphy2,
  requests,
  sentencepiece,
  soundfile,
  statsmodels,
  tenacity,
  tiktoken,
  tqdm,
  vllm,
  wandb,

  # tests
  pytestCheckHook,
  writableTmpDirAsHomeHook,
}:

buildPythonPackage (finalAttrs: {
  pname = "lm-eval";
  version = "0.4.11";
  pyproject = true;

  # Upstream tags releases as "v<version>".
  src = fetchFromGitHub {
    owner = "EleutherAI";
    repo = "lm-evaluation-harness";
    tag = "v${finalAttrs.version}";
    hash = "sha256-+zhZ+I+gzoF7g0xYvlPbZFcFy2PuFOgNTFLvbmdE1R0=";
  };

  build-system = [ setuptools ];

  # Always-required runtime dependencies.
  dependencies = [
    datasets
    dill
    evaluate
    jinja2
    jsonlines
    more-itertools
    pytablewriter
    rouge-score
    sacrebleu
    scikit-learn
    sqlitedict
    typing-extensions
    word2number
    zstandard
  ];

  # Extras, keyed by the optional dependency group name.
  optional-dependencies = {
    api = [
      aiohttp
      requests
      tenacity
      tiktoken
      tqdm
    ];
    audiolm_qwen = [
      librosa
      soundfile
    ];
    discrim_eval = [ statsmodels ];
    hf = [
      accelerate
      peft
      torch
      transformers
    ];
    ifeval = [
      immutabledict
      langdetect
      nltk
    ];
    libra = [ pymorphy2 ];
    # Pulls in optimum together with optimum's own `openvino` extra.
    optimum = [ optimum ] ++ optimum.optional-dependencies.openvino;
    sentencepiece = [ sentencepiece ];
    vllm = [ vllm ];
    wandb = [
      numpy
      pandas
      wandb
    ];

    # Optional dependency groups that are not provided here because
    # some of their dependencies are still missing:
    #   acpbench, deepsparse, gptq, gptqmodel, ibm_watsonx_ai, ipex,
    #   japanese_leaderboard, longbench, math, multilingual, ruler,
    #   sparsify, tasks, unitxt, zeno
  };

  # Do not enforce the version constraint declared for datasets.
  pythonRelaxDeps = [ "datasets" ];

  pythonImportsCheck = [ "lm_eval" ];

  # The test suite additionally runs with the `api` and `hf` extras installed.
  nativeCheckInputs =
    let
      extras = finalAttrs.passthru.optional-dependencies;
    in
    [
      pytestCheckHook
      sentencepiece
      writableTmpDirAsHomeHook
    ]
    ++ extras.api
    ++ extras.hf;

  disabledTests = [
    # deepsparse is not available
    "test_deepsparse"

    # These download models from the internet
    "test_get_batched_requests_with_no_ssl"
    "test_model_tokenized_call_usage"
  ];

  disabledTestPaths = [
    # These test modules attempt to download models
    "tests/models/test_bos_handling.py"
    "tests/models/test_huggingface.py"
    "tests/test_evaluator.py"
    "tests/test_include_path.py"
    "tests/test_prompt.py"
    "tests/test_task_manager.py"
    "tests/test_tasks.py"
    "tests/test_unitxt_tasks.py"

    # optimum-intel is not available
    "tests/models/test_openvino.py"

    # zeno-client is not packaged
    "tests/scripts/test_zeno_visualize.py"
  ];

  meta = {
    changelog = "https://github.com/EleutherAI/lm-evaluation-harness/releases/tag/${finalAttrs.src.tag}";
    description = "Framework for few-shot evaluation of language models";
    homepage = "https://github.com/EleutherAI/lm-evaluation-harness";
    license = [ lib.licenses.mit ];
    maintainers = [ lib.maintainers.booxter ];
  };
})