lib.sources.sourceByGlobs: init function

Adds a source filtering function inspired by [doublestar](https://github.com/bmatcuk/doublestar).

This has been in use in a few private repositories for the last ~6 months with success.

- Testing

This was originally tested with the nix-unit testsuite:
```
let
  inherit (import ./internal.nix) mkSourceFilter mkMatcher;
in
{
  mkMatcher = {
    empty = {
      testMatch = {
        expr = mkMatcher "" "" "regular";
        expected = true;
      };

      testNoMatch = {
        expr = mkMatcher "" "foo" "regular";
        expected = false;
      };
    };

    simple = {
      testMatch = {
        expr = mkMatcher "foo" "foo" "regular";
        expected = true;
      };

      testNoMatch = {
        expr = mkMatcher "foo" "bar" "regular";
        expected = false;
      };
    };

    singleStar = {
      testMatch = {
        expr = mkMatcher "*.js" "foo.js" "regular";
        expected = true;
      };

      testNoMatch = {
        expr = mkMatcher "*.js" "foo.py" "regular";
        expected = false;
      };
    };

    doubleStar = {
      testMatch = {
        expr = mkMatcher "foo/**/bar" "foo/baz/bar" "regular";
        expected = true;
      };

      testNoMatch = {
        expr = mkMatcher "foo/**/bar" "foo/bar/baz" "regular";
        expected = false;
      };

      testMultiMatch = {
        expr = mkMatcher "foo/**/bar" "foo/baz/xyz/bar" "regular";
        expected = true;
      };

      testMultiMatchDoubleGlob = {
        expr = mkMatcher "foo/**/**/bar" "foo/baz/xyz/bar" "regular";
        expected = true;
      };

      testInfixMatch = {
        expr = mkMatcher "foo/**/qux/**/bar" "foo/baz/qux/baz/bar" "regular";
        expected = true;
      };

      testInfixNoMatch = {
        expr = mkMatcher "foo/**/xyz/**/bar" "foo/baz/qux/baz/bar" "regular";
        expected = false;
      };

      # Technically a partial match
      testInfixDirMatch = {
        expr = mkMatcher "foo/**/xyz/**/bar" "foo/baz/qux/baz/bar" "directory";
        expected = true;
      };
    };
  };

  mkSourceFilter = {
    testSourceFilter = {
      expr = mkSourceFilter ./fixtures [
        "bar/*.js"
      ] "bar/bar.js" "regular";
      expected = true;
    };
  };
}
```
but it was dropped from this nixpkgs contribution, as the structure of nixpkgs lib testing is too primitive to incorporate it without more extensive refactoring than I'd like at the moment.

- Performance

It's hard to benchmark this against anything else meaningful except [globset](https://github.com/pdtpartners/globset), which has a very similar API.

`sourceByGlobs` avoids performance pitfalls by:

  - Using `builtins.filterSource`

      This is more performant than the fileset API.
      The downside compared to the fileset API is that any directory which matches the filter will be added to the build, even if it's empty.

  - Match paths component by component

      By splitting each pattern into a token per / separator.
      This is much faster in Nix than the doublestar algorithm.

- Globset source

```json
{
    "cpuTime": 0.8585879802703857,
    "envs": {
        "bytes": 148252864,
        "elements": 11899843,
        "number": 6631765
    },
    "gc": {
        "heapSize": 402915328,
        "totalBytes": 671288560
    },
    "list": {
        "bytes": 3358664,
        "concats": 28658,
        "elements": 419833
    },
    "nrAvoided": 11562713,
    "nrFunctionCalls": 4816963,
    "nrLookups": 4316209,
    "nrOpUpdateValuesCopied": 5686407,
    "nrOpUpdates": 464060,
    "nrPrimOpCalls": 2966970,
    "nrThunks": 7796186,
    "sets": {
        "bytes": 196404672,
        "elements": 10837802,
        "number": 1437490
    },
    "sizes": {
        "Attr": 16,
        "Bindings": 16,
        "Env": 8,
        "Value": 24
    },
    "symbols": {
        "bytes": 340652,
        "number": 32026
    },
    "values": {
        "bytes": 207367440,
        "number": 8640310
    }
}
```

- Glob-filter source

```json
{
    "cpuTime": 0.3904629945755005,
    "envs": {
        "bytes": 13263440,
        "elements": 1005877,
        "number": 652053
    },
    "gc": {
        "heapSize": 402915328,
        "totalBytes": 146914896
    },
    "list": {
        "bytes": 3032168,
        "concats": 5899,
        "elements": 379021
    },
    "nrAvoided": 1666598,
    "nrFunctionCalls": 484399,
    "nrLookups": 112698,
    "nrOpUpdateValuesCopied": 3432135,
    "nrOpUpdates": 13426,
    "nrPrimOpCalls": 1041954,
    "nrThunks": 1205792,
    "sets": {
        "bytes": 64304800,
        "elements": 3978167,
        "number": 40883
    },
    "sizes": {
        "Attr": 16,
        "Bindings": 16,
        "Env": 8,
        "Value": 24
    },
    "symbols": {
        "bytes": 285306,
        "number": 28864
    },
    "values": {
        "bytes": 42963240,
        "number": 1790135
    }
}
```
This commit is contained in:
adisbladis
2025-11-16 21:14:20 +13:00
parent 60dd3b28f0
commit 59d55cbaa3
2 changed files with 121 additions and 0 deletions

View File

@@ -7,12 +7,19 @@ let
match match
split split
storeDir storeDir
escapeRegex
removePrefix
; ;
inherit (lib) inherit (lib)
boolToString boolToString
filter filter
isString isString
readFile readFile
concatStrings
length
elemAt
isList
any
; ;
inherit (lib.filesystem) inherit (lib.filesystem)
pathIsRegularFile pathIsRegularFile
@@ -513,6 +520,107 @@ let
else else
throw "repoRevToName: invalid kind"; throw "repoRevToName: invalid kind";
/**
  Filter sources by a list of double star glob patterns.

  Each pattern is split on `/` and matched against the path, relative to
  `src`, one component at a time:

  - Within a component, a run of `*` matches any characters except `/`.
  - A component that is exactly `**` matches zero or more whole components.

  A path is kept if at least one pattern matches it. Directories and symlinks
  are also kept on a partial match (the path ends before the pattern does),
  so that the filter can descend into them. As a consequence a directory that
  partially matches is included even if it ends up empty.

  # Inputs

  `src`

  : Root of the source tree to filter (passed on to `cleanSourceWith`)

  `patterns`

  : List of glob pattern strings, relative to `src`

  # Examples
  :::{.example}
  ## `sourceByGlobs` usage example

  - Include all .py files recursively
  ```nix
  src = sourceByGlobs ./my-subproject ["**\/*.py" ]
  ```

  - Include all .py files in root directory only
  ```nix
  src = sourceByGlobs ./my-subproject ["*.py" ]
  ```
  :::
*/
sourceByGlobs =
  let
    # Split a path or pattern into its `/`-separated components.
    # `split` interleaves match groups (lists) with the strings; keep the strings.
    splitPath = path: filter isString (split "\/" path);

    # Make component regex.
    # `**` becomes the sentinel ".*", which the matcher treats specially.
    # Any other component has its literal parts escaped and every run of `*`
    # replaced with "any characters except /".
    mkRe =
      s:
      if s == "**" then
        ".*" # Has special handling below
      else
        concatStrings (map (tok: if isList tok then "[^\/]*" else escapeRegex tok) (split "\\*+" s));

    # Make a matcher function (path: type: bool) from a single pattern.
    # The pattern is compiled once; the returned function is applied per path.
    mkMatcher =
      pat:
      let
        globs = map mkRe (splitPath pat);
        glen = length globs;
      in
      path: type:
      let
        path' = splitPath path;
        plen = length path';
        # Match glob component `gi` onwards against path component `pi` onwards.
        # `g`, `p` and `m` are lazy, so they are only forced after the bounds
        # checks below have passed.
        recurse =
          gi: pi:
          let
            g = elemAt globs gi;
            p = elemAt path' pi;
            m = match g p != null;
          in
          if pi >= plen then # Reached end of path
            gi >= glen || (type == "directory" || type == "symlink") # Only allow partial matches for directories
          else if gi >= glen then # Reached end of globs
            false
          else if g == ".*" then # Special handling for **
            (
              if (gi + 1) == glen then
                # Trailing ** matches everything that remains
                true
              else
                # Either ** stops here and the next glob takes over at this
                # component, or ** swallows this component and we try again
                # at the next one. The second alternative must still be tried
                # when the first fails: for `**/*.py` against `foo.py/bar.py`
                # the lookahead matches `foo.py`, but only the later `bar.py`
                # completes the match.
                (match (elemAt globs (gi + 1)) p != null && recurse (gi + 1) pi)
                || (m && recurse gi (pi + 1))
            )
          else if m then
            recurse (gi + 1) (pi + 1)
          else
            false;
      in
      recurse 0 0;

    # Build a `filter` function for `cleanSourceWith`: strip the root prefix
    # from the absolute path and accept it if any pattern matcher accepts it.
    mkSourceFilter =
      root: patterns:
      let
        root' = "${toString root}/";
        matchers = map mkMatcher patterns;
      in
      name: type:
      let
        name' = removePrefix root' name;
      in
      any (m: m name' type) matchers;
  in
  src: patterns:
  lib.cleanSourceWith {
    filter = mkSourceFilter src patterns;
    inherit src;
  };
in in
{ {
inherit inherit
@@ -532,6 +640,7 @@ in
sourceByRegex sourceByRegex
sourceFilesBySuffices sourceFilesBySuffices
sourceByGlobs
trace trace
; ;

View File

@@ -70,4 +70,16 @@ dir="$(nix-instantiate --eval --strict --read-write-mode --json --expr '(with im
EOF EOF
) || die "cleanSourceWith + cleanSource" ) || die "cleanSourceWith + cleanSource"
dir="$(nix-instantiate --eval --strict --read-write-mode --json --expr '(with import <nixpkgs/lib>; "${
sources.sourceByGlobs '"$work"' [ "*.md" "**/*.o" ]
}")' | crudeUnquoteJSON)"
(cd "$dir"; find) | sort -f | diff -U10 - <(cat <<EOF
.
./module.o
./README.md
EOF
) || die "sourceByGlobs 1"
echo >&2 tests ok echo >&2 tests ok