From 59d55cbaa3f3e323167a0ba8ee262b7218a709e2 Mon Sep 17 00:00:00 2001
From: adisbladis
Date: Sun, 16 Nov 2025 21:14:20 +1300
Subject: [PATCH] lib.sources.sourceByGlobs: init function

Adds a source filtering function inspired by [doublestar](https://github.com/bmatcuk/doublestar).

This has been in use in a few private repositories for the last ~6 months with success.

- Testing

This was originally tested with the nix-unit testsuite:
```
let
  inherit (import ./internal.nix) mkSourceFilter mkMatcher;
in
{
  mkMatcher = {
    empty = {
      testMatch = {
        expr = mkMatcher "" "" "regular";
        expected = true;
      };
      testNoMatch = {
        expr = mkMatcher "" "foo" "regular";
        expected = false;
      };
    };
    simple = {
      testMatch = {
        expr = mkMatcher "foo" "foo" "regular";
        expected = true;
      };
      testNoMatch = {
        expr = mkMatcher "foo" "bar" "regular";
        expected = false;
      };
    };
    singleStar = {
      testMatch = {
        expr = mkMatcher "*.js" "foo.js" "regular";
        expected = true;
      };
      testNoMatch = {
        expr = mkMatcher "*.js" "foo.py" "regular";
        expected = false;
      };
    };
    doubleStar = {
      testMatch = {
        expr = mkMatcher "foo/**/bar" "foo/baz/bar" "regular";
        expected = true;
      };
      testNoMatch = {
        expr = mkMatcher "foo/**/bar" "foo/bar/baz" "regular";
        expected = false;
      };
      testMultiMatch = {
        expr = mkMatcher "foo/**/bar" "foo/baz/xyz/bar" "regular";
        expected = true;
      };
      testMultiMatchDoubleGlob = {
        expr = mkMatcher "foo/**/**/bar" "foo/baz/xyz/bar" "regular";
        expected = true;
      };
      testInfixMatch = {
        expr = mkMatcher "foo/**/qux/**/bar" "foo/baz/qux/baz/bar" "regular";
        expected = true;
      };
      testInfixNoMatch = {
        expr = mkMatcher "foo/**/xyz/**/bar" "foo/baz/qux/baz/bar" "regular";
        expected = false;
      };
      # Technically a partial match
      testInfixDirMatch = {
        expr = mkMatcher "foo/**/xyz/**/bar" "foo/baz/qux/baz/bar" "directory";
        expected = true;
      };
    };
  };
  mkSourceFilter = {
    testSourceFilter = {
      expr = mkSourceFilter ./fixtures [ "bar/*.js" ] "bar/bar.js" "regular";
      expected = true;
    };
  };
}
```
but it was dropped in this nixpkgs contribution as the
structure of nixpkgs lib testing is too primitive to incorporate this without more extensive refactoring than I'd like at the moment.

- Performance

It's hard to benchmark this against anything else meaningful except [globset](https://github.com/pdtpartners/globset), which has a very similar API.

`sourceByGlobs` avoids performance pitfalls by:

- Using `builtins.filterSource`

This is more performant than the fileset API.
The downside compared to the fileset API is that any directory which matches the filter will be added to the build, even if it's empty.

- Match paths component by component

By splitting each pattern into a token per / separator.
This is much faster in Nix than the doublestar algorithm.

- Globset source
```json
{ "cpuTime": 0.8585879802703857, "envs": { "bytes": 148252864, "elements": 11899843, "number": 6631765 }, "gc": { "heapSize": 402915328, "totalBytes": 671288560 }, "list": { "bytes": 3358664, "concats": 28658, "elements": 419833 }, "nrAvoided": 11562713, "nrFunctionCalls": 4816963, "nrLookups": 4316209, "nrOpUpdateValuesCopied": 5686407, "nrOpUpdates": 464060, "nrPrimOpCalls": 2966970, "nrThunks": 7796186, "sets": { "bytes": 196404672, "elements": 10837802, "number": 1437490 }, "sizes": { "Attr": 16, "Bindings": 16, "Env": 8, "Value": 24 }, "symbols": { "bytes": 340652, "number": 32026 }, "values": { "bytes": 207367440, "number": 8640310 } }
```

- Glob-filter source
```json
{ "cpuTime": 0.3904629945755005, "envs": { "bytes": 13263440, "elements": 1005877, "number": 652053 }, "gc": { "heapSize": 402915328, "totalBytes": 146914896 }, "list": { "bytes": 3032168, "concats": 5899, "elements": 379021 }, "nrAvoided": 1666598, "nrFunctionCalls": 484399, "nrLookups": 112698, "nrOpUpdateValuesCopied": 3432135, "nrOpUpdates": 13426, "nrPrimOpCalls": 1041954, "nrThunks": 1205792, "sets": { "bytes": 64304800, "elements": 3978167, "number": 40883 }, "sizes": { "Attr": 16, "Bindings": 16, "Env": 8, "Value": 24 }, "symbols": { "bytes": 285306, "number": 28864 }, "values": { "bytes": 42963240, "number": 1790135 } }
```
---
 lib/sources.nix      | 122 +++++++++++++++++++++++++++++++++++++++++++
 lib/tests/sources.sh |  12 +++++
 2 files changed, 134 insertions(+)

diff --git a/lib/sources.nix b/lib/sources.nix
index 43bc2ab2dbe3..81c5305842a1 100644
--- a/lib/sources.nix
+++ b/lib/sources.nix
@@ -7,12 +7,19 @@ let
     match
     split
     storeDir
+    escapeRegex
+    removePrefix
     ;
   inherit (lib)
     boolToString
     filter
     isString
     readFile
+    concatStrings
+    length
+    elemAt
+    isList
+    any
     ;
   inherit (lib.filesystem)
     pathIsRegularFile
@@ -513,6 +520,120 @@ let
     else
       throw "repoRevToName: invalid kind";
 
+  /**
+    Filter sources by a list of double star glob patterns.
+
+    Each pattern is split on `/` and matched against the path relative to
+    `src`, one path component at a time:
+
+    - `*` matches any run of characters inside a single path component
+    - a component that is exactly `**` matches zero or more path components
+    - everything else is matched literally
+
+    A path is kept when any of the patterns matches it. Directories and
+    symlinks are also kept when they match only the leading part of a pattern,
+    so that the files below them can be reached. Such a directory ends up in
+    the result even if none of its files match.
+
+    # Inputs
+
+    `src`
+
+    : The path, or source produced by `cleanSourceWith`, to filter
+
+    `patterns`
+
+    : List of glob pattern strings
+
+    # Examples
+    :::{.example}
+    ## `sourceByGlobs` usage example
+
+    - Include all .py files recursively
+    ```nix
+    src = sourceByGlobs ./my-subproject ["**\/*.py" ]
+    ```
+
+    - Include all .py files in root directory only
+    ```nix
+    src = sourceByGlobs ./my-subproject ["*.py" ]
+    ```
+
+    :::
+  */
+  sourceByGlobs =
+    let
+      # Split a string into the components between its / separators
+      splitPath = path: filter isString (split "\/" path);
+
+      # Make component regex: a run of * matches within the component, the rest
+      # is literal. A lone ** becomes ".*", which mkMatcher handles specially.
+      mkRe =
+        s:
+        if s == "**" then
+          ".*"
+        else
+          concatStrings (map (tok: if isList tok then "[^\/]*" else escapeRegex tok) (split "\\*+" s));
+
+      # Make a matcher (relative path -> file type -> bool) from a pattern
+      mkMatcher =
+        pat:
+        let
+          globs = map mkRe (splitPath pat);
+          glen = length globs;
+        in
+        path: type:
+        let
+          path' = splitPath path;
+          plen = length path';
+
+          # Match the globs from index gi on against the path components from index pi on
+          recurse =
+            gi: pi:
+            let
+              g = elemAt globs gi;
+              p = elemAt path' pi;
+            in
+            if pi >= plen then # Reached end of path
+              # Only allow partial matches for directories, they have to be entered to reach their files
+              gi >= glen || type == "directory" || type == "symlink"
+            else if gi >= glen then # Reached end of globs
+              false
+            else if g == ".*" then # Special handling for **
+              # A trailing ** matches all that is left. Otherwise first let ** match
+              # nothing, then fall back to it consuming this path component.
+              # Without the fallback "**/b/c" would reject "a/b/x/b/c", because
+              # matching committed to the first "b" and then failed on "x".
+              (gi + 1) == glen || recurse (gi + 1) pi || recurse gi (pi + 1)
+            else
+              (match g p != null) && recurse (gi + 1) (pi + 1);
+        in
+        recurse 0 0;
+
+      mkSourceFilter =
+        root: patterns:
+        let
+          root' = "${toString root}/";
+          matchers = map mkMatcher patterns;
+        in
+        name: type:
+        let
+          name' = removePrefix root' name;
+        in
+        any (m: m name' type) matchers;
+    in
+    src: patterns:
+    let
+      # Filters are called with paths below the original, unfiltered source.
+      # NOTE(review): assumes a source made by cleanSourceWith is marked with
+      # `_isLibCleanSourceWith` and has `origSrc`, confirm against cleanSourceWith.
+      # Any other `src` is used as the root directly, as before.
+      origSrc = if src ? _isLibCleanSourceWith then src.origSrc else src;
+    in
+    lib.cleanSourceWith {
+      filter = mkSourceFilter origSrc patterns;
+      inherit src;
+    };
 in
 {
   inherit
@@ -532,6 +653,7 @@ in
 
     sourceByRegex
     sourceFilesBySuffices
+    sourceByGlobs
 
     trace
     ;
diff --git a/lib/tests/sources.sh b/lib/tests/sources.sh
index 079c7eea5657..08315a303bc2 100755
--- a/lib/tests/sources.sh
+++ b/lib/tests/sources.sh
@@ -70,4 +70,16 @@ dir="$(nix-instantiate --eval --strict --read-write-mode --json --expr '(with im
 EOF
 ) || die "cleanSourceWith + cleanSource"
+
+dir="$(nix-instantiate --eval --strict --read-write-mode --json --expr '(with import ; "${
+  sources.sourceByGlobs '"$work"' [ "*.md" "**/*.o" ]
+}")' | crudeUnquoteJSON)"
+(cd "$dir"; find) | sort -f | diff -U10 - <(cat <&2 tests ok