# Python package expression for `markitdown`, built from the
# `packages/markitdown` subdirectory of the microsoft/markitdown repository.
#
# NOTE: every `#` comment must sit on its own line (or at the end of one):
# Nix line comments extend to the end of the physical line, so joining these
# lines together would comment out the remainder of the expression.
{
  lib,
  stdenv,
  buildPythonPackage,
  fetchFromGitHub,

  # build-system
  hatchling,

  # dependencies
  beautifulsoup4,
  defusedxml,
  ffmpeg-headless,
  lxml,
  magika,
  mammoth,
  markdownify,
  numpy,
  olefile,
  openai,
  openpyxl,
  pandas,
  pathvalidate,
  pdfminer-six,
  puremagic,
  pydub,
  python-pptx,
  requests,
  speechrecognition,
  xlrd,
  youtube-transcript-api,

  # tests
  pytestCheckHook,

  # passthru
  gitUpdater,
}:

let
  # True on every host platform except aarch64-linux; gates both the import
  # check and the test suite below.
  isNotAarch64Linux = !(stdenv.hostPlatform.isLinux && stdenv.hostPlatform.isAarch64);
in
buildPythonPackage (finalAttrs: {
  pname = "markitdown";
  version = "0.1.4";
  pyproject = true;

  src = fetchFromGitHub {
    owner = "microsoft";
    repo = "markitdown";
    tag = "v${finalAttrs.version}";
    hash = "sha256-WKA2eY8wY3SM9xZ7Cek5eUcJbO5q6eMDx2aTKfQnFvE=";
  };

  # Only the `packages/markitdown` subdirectory of the fetched source is built.
  sourceRoot = "${finalAttrs.src.name}/packages/markitdown";

  build-system = [ hatchling ];

  # Do not enforce the version constraint declared for magika.
  pythonRelaxDeps = [
    "magika"
  ];

  dependencies = [
    beautifulsoup4
    defusedxml
    # NOTE(review): presumably the ffmpeg binary is needed at runtime for
    # audio handling (pydub) -- confirm.
    ffmpeg-headless
    lxml
    magika
    mammoth
    markdownify
    numpy
    olefile
    openai
    openpyxl
    pandas
    pathvalidate
    pdfminer-six
    puremagic
    pydub
    python-pptx
    requests
    speechrecognition
    xlrd
    youtube-transcript-api
  ];

  # aarch64-linux fails the cpuinfo test, because /sys/devices/system/cpu/
  # does not exist in the sandbox:
  #   terminate called after throwing an instance of 'onnxruntime::OnnxRuntimeException'
  #
  # -> Skip everything that requires importing markitdown on that platform:
  #    both the import check and the whole test suite.
  pythonImportsCheck = lib.optionals isNotAarch64Linux [
    "markitdown"
  ];

  doCheck = isNotAarch64Linux;

  nativeCheckInputs = [
    pytestCheckHook
  ];

  disabledTests = [
    # Require network access
    "test_markitdown_remote"
    "test_module_vectors"
    "test_cli_vectors"
    "test_module_misc"
  ];

  passthru.updateScript = gitUpdater { };

  meta = {
    description = "Python tool for converting files and office documents to Markdown";
    homepage = "https://github.com/microsoft/markitdown";
    # Points at the release page for the same tag that `src` fetches.
    changelog = "https://github.com/microsoft/markitdown/releases/tag/${finalAttrs.src.tag}";
    license = lib.licenses.mit;
    maintainers = with lib.maintainers; [ malik ];
  };
})