From a88839a9011e380fec88921fb38c90838ecbcada Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kier=C3=A1n=20Meinhardt?= Date: Thu, 8 Jan 2026 11:25:37 +0100 Subject: [PATCH 01/37] nixos/nspawn-container: add support for shared directory Co-authored-by: Jeremy Fleischman --- .../run-nspawn/src/run_nspawn/__init__.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/nixos/modules/virtualisation/nspawn-container/run-nspawn/src/run_nspawn/__init__.py b/nixos/modules/virtualisation/nspawn-container/run-nspawn/src/run_nspawn/__init__.py index 99f50038fd7c..d6fe8f4958fc 100644 --- a/nixos/modules/virtualisation/nspawn-container/run-nspawn/src/run_nspawn/__init__.py +++ b/nixos/modules/virtualisation/nspawn-container/run-nspawn/src/run_nspawn/__init__.py @@ -126,6 +126,7 @@ def mk_veth( def run( container_name: str, root_dir_str: str, + shared_dir_str: typing.Optional[str], interfaces: dict, nspawn_options: list[str], init: str, @@ -166,12 +167,19 @@ def run( flush=True, ) + shared_dir = Path(shared_dir_str) if shared_dir_str else None + cp = subprocess.Popen( [ "@systemd-nspawn@", *nspawn_options, f"--directory={root_dir}", f"--network-namespace-path={netns.path}", + *( + [f"--bind={shared_dir}:/tmp/shared"] + if shared_dir is not None + else [] + ), init, *cmdline, ], @@ -218,6 +226,11 @@ def main(): required=True, help="Path to container root directory (overridable with RUN_NSPAWN_ROOT_DIR)", ) + arg_parser.add_argument( + "--shared-dir", + required=False, + help="Path to a shared directory to bind-mount into the container at /tmp/shared (overridable with RUN_NSPAWN_SHARED_DIR)", + ) arg_parser.add_argument( "--interfaces-json", dest="interfaces", @@ -239,6 +252,7 @@ def main(): run( container_name=args.container_name, root_dir_str=os.getenv("RUN_NSPAWN_ROOT_DIR", default=args.root_dir), + shared_dir_str=os.getenv("RUN_NSPAWN_SHARED_DIR", default=args.shared_dir), interfaces=args.interfaces, nspawn_options=nspawn_options, init=args.init, From 
628195bf0158a140128be8c781367a39ddd2d0db Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kier=C3=A1n=20Meinhardt?= Date: Mon, 12 Jan 2026 10:02:43 +0100 Subject: [PATCH 02/37] nixos/virtualisation: correct virtualisation.vlans docstring --- nixos/modules/virtualisation/guest-networking-options.nix | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nixos/modules/virtualisation/guest-networking-options.nix b/nixos/modules/virtualisation/guest-networking-options.nix index 817ccc4e6370..39d9eea24f3d 100644 --- a/nixos/modules/virtualisation/guest-networking-options.nix +++ b/nixos/modules/virtualisation/guest-networking-options.nix @@ -71,7 +71,7 @@ in virtualisation.vlans = lib.mkOption { type = types.listOf types.ints.unsigned; default = if cfg.interfaces == { } then [ 1 ] else [ ]; - defaultText = lib.literalExpression "if cfg.interfaces == {} then [ 1 ] else [ ]"; + defaultText = lib.literalExpression "if config.virtualisation.interfaces == {} then [ 1 ] else [ ]"; example = [ 1 2 From 197287daf45715354d13bbb4972689d98a91989d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kier=C3=A1n=20Meinhardt?= Date: Thu, 8 Jan 2026 11:26:58 +0100 Subject: [PATCH 03/37] nixos/test-driver: fix typo Co-authored-by: Jeremy Fleischman --- nixos/lib/test-driver/src/test_driver/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nixos/lib/test-driver/src/test_driver/__init__.py b/nixos/lib/test-driver/src/test_driver/__init__.py index 422cf3917233..557cb1ff8142 100755 --- a/nixos/lib/test-driver/src/test_driver/__init__.py +++ b/nixos/lib/test-driver/src/test_driver/__init__.py @@ -16,7 +16,7 @@ from test_driver.logger import ( class EnvDefault(argparse.Action): - """An argpars Action that takes values from the specified + """An argparse Action that takes values from the specified environment variable as the flags default value. 
""" From 799cafcc2338d02e0ff34063f3a60101d726e3ec Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kier=C3=A1n=20Meinhardt?= Date: Thu, 8 Jan 2026 11:59:38 +0100 Subject: [PATCH 04/37] nixos/test-driver: refactor Machine to BaseMachine and QemuMachine This prepares the driver for non-QEMU backends by abstracting the machine logic. Co-authored-by: Jeremy Fleischman --- .../lib/test-driver/src/extract-docstrings.py | 2 +- .../test-driver/src/test_driver/__init__.py | 43 +- .../lib/test-driver/src/test_driver/driver.py | 53 +- .../src/test_driver/machine/__init__.py | 826 ++++++++++-------- nixos/lib/test-script-prepend.py | 6 +- nixos/lib/testing/driver.nix | 29 +- 6 files changed, 538 insertions(+), 421 deletions(-) diff --git a/nixos/lib/test-driver/src/extract-docstrings.py b/nixos/lib/test-driver/src/extract-docstrings.py index 64850ca711f3..030ed0189704 100644 --- a/nixos/lib/test-driver/src/extract-docstrings.py +++ b/nixos/lib/test-driver/src/extract-docstrings.py @@ -51,7 +51,7 @@ def main() -> None: class_definitions = (node for node in module.body if isinstance(node, ast.ClassDef)) - machine_class = next(filter(lambda x: x.name == "Machine", class_definitions)) + machine_class = next(filter(lambda x: x.name == "BaseMachine", class_definitions)) assert machine_class is not None function_definitions = [ diff --git a/nixos/lib/test-driver/src/test_driver/__init__.py b/nixos/lib/test-driver/src/test_driver/__init__.py index 557cb1ff8142..35c5e3b11fb6 100755 --- a/nixos/lib/test-driver/src/test_driver/__init__.py +++ b/nixos/lib/test-driver/src/test_driver/__init__.py @@ -71,10 +71,18 @@ def main() -> None: help="Enable interactive debugging breakpoints for sandboxed runs", ) arg_parser.add_argument( - "--start-scripts", - metavar="START-SCRIPT", + "--vm-names", + metavar="VM-NAME", action=EnvDefault, - envvar="startScripts", + envvar="vmNames", + nargs="*", + help="names of participating virtual machines", + ) + arg_parser.add_argument( + "--vm-start-scripts", + 
metavar="VM-START-SCRIPT", + action=EnvDefault, + envvar="vmStartScripts", nargs="*", help="start scripts for participating virtual machines", ) @@ -138,14 +146,20 @@ def main() -> None: if args.debug_hook_attach is not None: debugger = Debug(logger, args.debug_hook_attach) + if args.vm_names is not None and args.vm_start_scripts is not None: + assert len(args.vm_names) == len(args.vm_start_scripts), ( + f"the number of vm names and vm start scripts must be the same: {args.vm_names} vs. {args.vm_start_scripts}" + ) + with Driver( - args.start_scripts, - args.vlans, - args.testscript.read_text(), - output_directory, - logger, - args.keep_vm_state, - args.global_timeout, + vm_names=args.vm_names, + vm_start_scripts=args.vm_start_scripts or [], + vlans=args.vlans, + tests=args.testscript.read_text(), + out_dir=output_directory, + logger=logger, + keep_vm_state=args.keep_vm_state, + global_timeout=args.global_timeout, debug=debugger, ) as driver: if offset := args.dump_vsocks: @@ -170,7 +184,14 @@ def generate_driver_symbols() -> None: in user's test scripts. That list is then used by pyflakes to lint those scripts. 
""" - d = Driver([], [], "", Path(), CompositeLogger([])) + d = Driver( + vm_names=[], + vm_start_scripts=[], + vlans=[], + tests="", + out_dir=Path(), + logger=CompositeLogger([]), + ) test_symbols = d.test_symbols() with open("driver-symbols", "w") as fp: fp.write(",".join(test_symbols.keys())) diff --git a/nixos/lib/test-driver/src/test_driver/driver.py b/nixos/lib/test-driver/src/test_driver/driver.py index c4f268404cbc..c8694d2ea09a 100644 --- a/nixos/lib/test-driver/src/test_driver/driver.py +++ b/nixos/lib/test-driver/src/test_driver/driver.py @@ -16,7 +16,7 @@ from colorama import Style from test_driver.debug import DebugAbstract, DebugNop from test_driver.errors import MachineError, RequestedAssertionFailed from test_driver.logger import AbstractLogger -from test_driver.machine import Machine, NixStartScript, retry +from test_driver.machine import BaseMachine, QemuMachine, retry from test_driver.polling_condition import PollingCondition from test_driver.vlan import VLan @@ -63,7 +63,7 @@ class Driver: tests: str vlans: list[VLan] - machines: list[Machine] + vm_machines: list[QemuMachine] polling_conditions: list[PollingCondition] global_timeout: int race_timer: threading.Timer @@ -72,7 +72,8 @@ class Driver: def __init__( self, - start_scripts: list[str], + vm_names: list[str] | None, + vm_start_scripts: list[str], vlans: list[int], tests: str, out_dir: Path, @@ -94,25 +95,30 @@ class Driver: vlans = list(set(vlans)) self.vlans = [VLan(nr, tmp_dir, self.logger) for nr in vlans] - def cmd(scripts: list[str]) -> Iterator[NixStartScript]: - for s in scripts: - yield NixStartScript(s) - self.polling_conditions = [] - self.machines = [ - Machine( - start_command=cmd, + self.vm_machines = [ + QemuMachine( + name=name, + start_command=vm_start_script, keep_vm_state=keep_vm_state, - name=cmd.machine_name, tmp_dir=tmp_dir, callbacks=[self.check_polling_conditions], out_dir=self.out_dir, logger=self.logger, ) - for cmd in cmd(start_scripts) + for name, 
vm_start_script in zip( + vm_names or (len(vm_start_scripts) * [None]), vm_start_scripts + ) ] + @property + def machines(self) -> list[QemuMachine]: + machines = self.vm_machines + # Sort the machines by name for consistency with `nodes` in . + machines.sort(key=lambda machine: machine.name) + return machines + def __enter__(self) -> "Driver": return self @@ -148,7 +154,7 @@ class Driver: general_symbols = dict( start_all=self.start_all, test_script=self.test_script, - machines=self.machines, + vm_machines=self.vm_machines, vlans=self.vlans, driver=self, log=self.logger, @@ -161,7 +167,7 @@ class Driver: serial_stdout_off=self.serial_stdout_off, serial_stdout_on=self.serial_stdout_on, polling_condition=self.polling_condition, - Machine=Machine, # for typing + BaseMachine=BaseMachine, # for typing t=AssertionTester(), debug=self.debug, ) @@ -186,14 +192,14 @@ class Driver: def dump_machine_ssh(self, offset: int) -> None: print("SSH backdoor enabled, the machines can be accessed like this:") print( - f"{Style.BRIGHT}Note:{Style.RESET_ALL} this requires {Style.BRIGHT}systemd-ssh-proxy(1){Style.RESET_ALL} to be enabled (default on NixOS 25.05 and newer)." + f"{Style.BRIGHT}Note:{Style.RESET_ALL} vsocks require {Style.BRIGHT}systemd-ssh-proxy(1){Style.RESET_ALL} to be enabled (default on NixOS 25.05 and newer)." 
) - names = [machine.name for machine in self.machines] - longest_name = len(max(names, key=len)) - for num, name in enumerate(names, start=offset + 1): + longest_name = len(max((machine.name for machine in self.machines), key=len)) + for index, machine in enumerate(self.machines, start=offset + 1): + name = machine.name spaces = " " * (longest_name - len(name) + 2) print( - f" {name}:{spaces}{Style.BRIGHT}ssh -o User=root vsock/{num}{Style.RESET_ALL}" + f" {name}:{spaces}{Style.BRIGHT}{machine.ssh_backdoor_command(index)}{Style.RESET_ALL}" ) def test_script(self) -> None: @@ -280,16 +286,13 @@ class Driver: *, name: str | None = None, keep_vm_state: bool = False, - ) -> Machine: + ) -> QemuMachine: tmp_dir = get_tmp_dir() - cmd = NixStartScript(start_command) - name = name or cmd.machine_name - - return Machine( + return QemuMachine( + start_command=start_command, tmp_dir=tmp_dir, out_dir=self.out_dir, - start_command=cmd, name=name, keep_vm_state=keep_vm_state, logger=self.logger, diff --git a/nixos/lib/test-driver/src/test_driver/machine/__init__.py b/nixos/lib/test-driver/src/test_driver/machine/__init__.py index f722d36ae40e..fecdeda493b5 100644 --- a/nixos/lib/test-driver/src/test_driver/machine/__init__.py +++ b/nixos/lib/test-driver/src/test_driver/machine/__init__.py @@ -13,6 +13,7 @@ import sys import tempfile import threading import time +from abc import ABC, abstractmethod from collections.abc import Callable, Generator from contextlib import _GeneratorContextManager, contextmanager, nullcontext from pathlib import Path @@ -114,15 +115,30 @@ def retry(fn: Callable, timeout_seconds: int = 900) -> None: ) -class StartCommand: - """The Base Start Command knows how to append the necessary +class QemuStartCommand: + """This class knows how to append the necessary runtime qemu options as determined by a particular test driver - run. Any such start command is expected to happily receive and - append additional qemu args. + run. 
""" _cmd: str + def __init__(self, script: str): + self._cmd = script + + @property + def machine_name(self) -> str: + """A start script from nixos/modules/virtualiation/qemu-vm.nix. + These Nix commands have the particular characteristic that the + machine name can be extracted out of them via a regex match. + (Admittedly a _very_ implicit contract, evtl. TODO fix) + """ + match = re.search("run-(.+)-vm$", self._cmd) + name = "machine" + if match: + name = match.group(1) + return name + def cmd( self, monitor_socket_path: Path, @@ -198,103 +214,42 @@ class StartCommand: ) -class NixStartScript(StartCommand): - """A start script from nixos/modules/virtualiation/qemu-vm.nix. - These Nix commands have the particular characteristic that the - machine name can be extracted out of them via a regex match. - (Admittedly a _very_ implicit contract, evtl. TODO fix) - """ - - def __init__(self, script: str): - self._cmd = script - - @property - def machine_name(self) -> str: - match = re.search("run-(.+)-vm$", self._cmd) - name = "machine" - if match: - name = match.group(1) - return name - - -class Machine: - """A handle to the machine with this name, that also knows how to manage - the machine lifecycle with the help of a start script / command.""" - +class BaseMachine(ABC): name: str - out_dir: Path + callbacks: list[Callable] tmp_dir: Path - shared_dir: Path - state_dir: Path - monitor_path: Path - qmp_path: Path - shell_path: Path - start_command: StartCommand keep_vm_state: bool - process: subprocess.Popen | None - pid: int | None - monitor: socket.socket | None - qmp_client: QMPSession | None - shell: socket.socket | None - serial_thread: threading.Thread | None - - booted: bool - connected: bool - # Store last serial console lines for use - # of wait_for_console_text - last_lines: Queue = Queue() - # Store all console output for full log retrieval - full_console_log: list[str] - callbacks: list[Callable] - def __repr__(self) -> str: - return f"" + return 
f"<{self.__class__.__name__} '{self.name}'>" def __init__( self, out_dir: Path, - tmp_dir: Path, - start_command: StartCommand, + name: str, logger: AbstractLogger, - name: str = "machine", - keep_vm_state: bool = False, - callbacks: list[Callable] | None = None, + tmp_dir: Path, + callbacks: list[Callable] | None, + keep_vm_state: bool, ) -> None: self.out_dir = out_dir - self.tmp_dir = tmp_dir - self.keep_vm_state = keep_vm_state self.name = name - self.start_command = start_command - self.callbacks = callbacks if callbacks is not None else [] self.logger = logger - self.full_console_log = [] + self.callbacks = callbacks if callbacks is not None else [] + self.tmp_dir = tmp_dir - # set up directories - self.shared_dir = self.tmp_dir / "shared-xchg" - self.shared_dir.mkdir(mode=0o700, exist_ok=True) + # Note: "vm" is a bit of a misnomer here. + # Consider renaming to something more generic ("machine"?) + self.keep_vm_state = keep_vm_state self.state_dir = self.tmp_dir / f"vm-state-{self.name}" - self.monitor_path = self.state_dir / "monitor" - self.qmp_path = self.state_dir / "qmp" - self.shell_path = self.state_dir / "shell" if (not self.keep_vm_state) and self.state_dir.exists(): self.cleanup_statedir() self.state_dir.mkdir(mode=0o700, exist_ok=True) - self.process = None - self.pid = None - self.monitor = None - self.qmp_client = None - self.shell = None - self.serial_thread = None - - self.booted = False - self.connected = False - - def is_up(self) -> bool: - return self.booted and self.connected + self.shared_dir = self.tmp_dir / "shared-xchg" + self.shared_dir.mkdir(mode=0o700, exist_ok=True) def log(self, msg: str) -> None: """ @@ -313,28 +268,50 @@ class Machine: my_attrs.update(attrs) return self.logger.nested(msg, my_attrs) - def wait_for_monitor_prompt(self) -> str: - assert self.monitor is not None - answer = "" - while True: - undecoded_answer = self.monitor.recv(1024) - if not undecoded_answer: - break - answer += undecoded_answer.decode() - if 
answer.endswith("(qemu) "): - break - return answer + @abstractmethod + def is_up(self) -> bool: + """ + Check whether the machine is running. + """ + pass - def send_monitor_command(self, command: str) -> str: + @abstractmethod + def start(self) -> None: """ - Send a command to the QEMU monitor. This allows attaching - virtual USB disks to a running machine, among other things. + Start the machine. """ - self.run_callbacks() - message = f"{command}\n".encode() - assert self.monitor is not None - self.monitor.send(message) - return self.wait_for_monitor_prompt() + pass + + @abstractmethod + def wait_for_shutdown(self) -> None: + """ + Wait for the machine to power off. This does *not* initiate a shutdown; + that's usually done via `shutdown()`. + """ + pass + + def systemctl(self, q: str, user: str | None = None) -> tuple[int, str]: + """ + Runs `systemctl` commands with optional support for + `systemctl --user` + + ```py + # run `systemctl list-jobs --no-pager` + machine.systemctl("list-jobs --no-pager") + + # spawn a shell for `any-user` and run + # `systemctl --user list-jobs --no-pager` + machine.systemctl("list-jobs --no-pager", "any-user") + ``` + """ + if user is not None: + q = q.replace("'", "\\'") + return self.execute( + f"su -l {user} --shell /bin/sh -c " + "$'XDG_RUNTIME_DIR=/run/user/`id -u` " + f"systemctl --user {q}'" + ) + return self.execute(f"systemctl {q}") def wait_for_unit( self, unit: str, user: str | None = None, timeout: int = 900 @@ -424,29 +401,6 @@ class Machine: assert match[1] == property, invalid_output_message return match[2] - def systemctl(self, q: str, user: str | None = None) -> tuple[int, str]: - """ - Runs `systemctl` commands with optional support for - `systemctl --user` - - ```py - # run `systemctl list-jobs --no-pager` - machine.systemctl("list-jobs --no-pager") - - # spawn a shell for `any-user` and run - # `systemctl --user list-jobs --no-pager` - machine.systemctl("list-jobs --no-pager", "any-user") - ``` - """ - if user 
is not None: - q = q.replace("'", "\\'") - return self.execute( - f"su -l {user} --shell /bin/sh -c " - "$'XDG_RUNTIME_DIR=/run/user/`id -u` " - f"systemctl --user {q}'" - ) - return self.execute(f"systemctl {q}") - def require_unit_state(self, unit: str, require_state: str = "active") -> None: """ Assert that the current state of a unit has a specific value. The default state is "active". @@ -462,6 +416,386 @@ class Machine: f"'{require_state}' but it is in state '{state}'" ) + def succeed(self, *commands: str, timeout: int | None = None) -> str: + """ + Execute a shell command, raising an exception if the exit status is + not zero, otherwise returning the standard output. Similar to `execute`, + except that the timeout is `None` by default. See `execute` for details on + command execution. + """ + output = "" + for command in commands: + with self.nested(f"must succeed: {command}"): + (status, out) = self.execute(command, timeout=timeout) + if status != 0: + self.log(f"output: {out}") + raise RequestedAssertionFailed( + f"command `{command}` failed (exit code {status})" + ) + output += out + return output + + def fail(self, *commands: str, timeout: int | None = None) -> str: + """ + Like `succeed`, but raising an exception if the command returns a zero + status. + """ + output = "" + for command in commands: + with self.nested(f"must fail: {command}"): + (status, out) = self.execute(command, timeout=timeout) + if status == 0: + raise RequestedAssertionFailed( + f"command `{command}` unexpectedly succeeded" + ) + output += out + return output + + def wait_until_succeeds(self, command: str, timeout: int = 900) -> str: + """ + Repeat a shell command with 1-second intervals until it succeeds. + Has a default timeout of 900 seconds which can be modified, e.g. + `wait_until_succeeds(cmd, timeout=10)`. See `execute` for details on + command execution. + Throws an exception on timeout. 
+ """ + output = "" + + def check_success(_last_try: bool) -> bool: + nonlocal output + status, output = self.execute(command, timeout=timeout) + return status == 0 + + with self.nested(f"waiting for success: {command}"): + retry(check_success, timeout) + return output + + def wait_until_fails(self, command: str, timeout: int = 900) -> str: + """ + Like `wait_until_succeeds`, but repeating the command until it fails. + """ + output = "" + + def check_failure(_last_try: bool) -> bool: + nonlocal output + status, output = self.execute(command, timeout=timeout) + return status != 0 + + with self.nested(f"waiting for failure: {command}"): + retry(check_failure, timeout) + return output + + def sleep(self, secs: int) -> None: + # We want to sleep in *guest* time, not *host* time. + self.succeed(f"sleep {secs}") + + def wait_for_file(self, filename: str, timeout: int = 900) -> None: + """ + Waits until the file exists in the machine's file system. + """ + + def check_file(_last_try: bool) -> bool: + status, _ = self.execute(f"test -e {filename}") + return status == 0 + + with self.nested(f"waiting for file '{filename}'"): + retry(check_file, timeout) + + def wait_for_open_port( + self, port: int, addr: str = "localhost", timeout: int = 900 + ) -> None: + """ + Wait until a process is listening on the given TCP port and IP address + (default `localhost`). + """ + + def port_is_open(_last_try: bool) -> bool: + status, _ = self.execute(f"nc -z {addr} {port}") + return status == 0 + + with self.nested(f"waiting for TCP port {port} on {addr}"): + retry(port_is_open, timeout) + + def wait_for_open_unix_socket( + self, addr: str, is_datagram: bool = False, timeout: int = 900 + ) -> None: + """ + Wait until a process is listening on the given UNIX-domain socket + (default to a UNIX-domain stream socket). 
+ """ + + nc_flags = [ + "-z", + "-uU" if is_datagram else "-U", + ] + + def socket_is_open(_last_try: bool) -> bool: + status, _ = self.execute(f"nc {' '.join(nc_flags)} {addr}") + return status == 0 + + with self.nested( + f"waiting for UNIX-domain {'datagram' if is_datagram else 'stream'} on '{addr}'" + ): + retry(socket_is_open, timeout) + + def wait_for_closed_port( + self, port: int, addr: str = "localhost", timeout: int = 900 + ) -> None: + """ + Wait until nobody is listening on the given TCP port and IP address + (default `localhost`). + """ + + def port_is_closed(_last_try: bool) -> bool: + status, _ = self.execute(f"nc -z {addr} {port}") + return status != 0 + + with self.nested(f"waiting for TCP port {port} on {addr} to be closed"): + retry(port_is_closed, timeout) + + def start_job(self, jobname: str, user: str | None = None) -> tuple[int, str]: + """ + Start systemd service. + """ + return self.systemctl(f"start {jobname}", user) + + def stop_job(self, jobname: str, user: str | None = None) -> tuple[int, str]: + """ + Stop systemd service. + """ + return self.systemctl(f"stop {jobname}", user) + + def wait_for_job(self, jobname: str) -> None: + self.wait_for_unit(jobname) + + def get_tty_text(self, tty: str) -> str: + """ + Get the output printed to a given TTY. + """ + status, output = self.execute( + f"fold -w$(stty -F /dev/tty{tty} size | awk '{{print $2}}') /dev/vcs{tty}" + ) + return output + + def wait_until_tty_matches(self, tty: str, regexp: str, timeout: int = 900) -> None: + """Wait until the visible output on the chosen TTY matches regular + expression. Throws an exception on timeout. 
+ """ + matcher = re.compile(regexp) + + def tty_matches(last_try: bool) -> bool: + text = self.get_tty_text(tty) + if last_try: + self.log( + f"Last chance to match /{regexp}/ on TTY{tty}, " + f"which currently contains: {text}" + ) + return len(matcher.findall(text)) > 0 + + with self.nested(f"waiting for {regexp} to appear on tty {tty}"): + retry(tty_matches, timeout) + + def dump_tty_contents(self, tty: str) -> None: + """Debugging: Dump the contents of the TTY""" + self.execute(f"fold -w 80 /dev/vcs{tty} | systemd-cat") + + def execute( + self, + command: str, + check_return: bool = True, + check_output: bool = True, + timeout: int | None = 900, + ) -> tuple[int, str]: + self.run_callbacks() + return self._execute( + command=command, + check_return=check_return, + check_output=check_output, + timeout=timeout, + ) + + @abstractmethod + def _execute( + self, + command: str, + check_return: bool = True, + check_output: bool = True, + timeout: int | None = 900, + ) -> tuple[int, str]: ... + + def run_callbacks(self) -> None: + for callback in self.callbacks: + callback() + + def cleanup_statedir(self) -> None: + shutil.rmtree(self.state_dir) + self.logger.log(f"deleting VM state directory {self.state_dir}") + self.logger.log("if you want to keep the VM state, pass --keep-vm-state") + + def copy_from_vm(self, source: str, target_dir: str = "") -> None: + """Copy a file from the VM (specified by an in-VM source path) to a path + relative to `$out`. The file is copied via the `shared_dir` shared among + all the VMs (using a temporary directory). 
+ """ + # Compute the source, target, and intermediate shared file names + vm_src = Path(source) + with tempfile.TemporaryDirectory(dir=self.shared_dir) as shared_td: + shared_temp = Path(shared_td) + vm_shared_temp = Path("/tmp/shared") / shared_temp.name + vm_intermediate = vm_shared_temp / vm_src.name + intermediate = shared_temp / vm_src.name + # Copy the file to the shared directory inside VM + self.succeed(make_command(["mkdir", "-p", vm_shared_temp])) + self.succeed(make_command(["cp", "-r", vm_src, vm_intermediate])) + abs_target = self.out_dir / target_dir / vm_src.name + abs_target.parent.mkdir(exist_ok=True, parents=True) + # Copy the file from the shared directory outside VM + if intermediate.is_dir(): + shutil.copytree(intermediate, abs_target) + else: + shutil.copy(intermediate, abs_target) + + def copy_from_host_via_shell(self, source: str, target: str) -> None: + """Copy a file from the host into the guest by piping it over the + shell into the destination file. Works without host-guest shared folder. + Prefer copy_from_host for whenever possible. + """ + with open(source, "rb") as fh: + content_b64 = base64.b64encode(fh.read()).decode() + self.succeed( + f"mkdir -p $(dirname {target})", + f"echo -n {content_b64} | base64 -d > {target}", + ) + + def copy_from_host(self, source: str, target: str) -> None: + """ + Copies a file from host to machine, e.g., + `copy_from_host("myfile", "/etc/my/important/file")`. + + The first argument is the file on the host. Note that the "host" refers + to the environment in which the test driver runs, which is typically the + Nix build sandbox. + + The second argument is the location of the file on the machine that will + be written to. + + The file is copied via the `shared_dir` directory which is shared among + all the VMs (using a temporary directory). + The access rights bits will mimic the ones from the host file and + user:group will be root:root. 
+ """ + host_src = Path(source) + vm_target = Path(target) + with tempfile.TemporaryDirectory(dir=self.shared_dir) as shared_td: + shared_temp = Path(shared_td) + host_intermediate = shared_temp / host_src.name + vm_shared_temp = Path("/tmp/shared") / shared_temp.name + vm_intermediate = vm_shared_temp / host_src.name + + self.succeed(make_command(["mkdir", "-p", vm_shared_temp])) + if host_src.is_dir(): + shutil.copytree(host_src, host_intermediate) + else: + shutil.copy(host_src, host_intermediate) + self.succeed(make_command(["mkdir", "-p", vm_target.parent])) + self.succeed(make_command(["cp", "-r", vm_intermediate, vm_target])) + + +class QemuMachine(BaseMachine): + """A handle to the machine with this name, that also knows how to manage + the machine lifecycle with the help of a start script / command.""" + + name: str + out_dir: Path + shared_dir: Path + state_dir: Path + monitor_path: Path + qmp_path: Path + shell_path: Path + + start_command: QemuStartCommand + + process: subprocess.Popen | None + pid: int | None + monitor: socket.socket | None + qmp_client: QMPSession | None + shell: socket.socket | None + serial_thread: threading.Thread | None + + booted: bool + connected: bool + # Store last serial console lines for use + # of wait_for_console_text + last_lines: Queue = Queue() + # Store all console output for full log retrieval + full_console_log: list[str] + + def __init__( + self, + out_dir: Path, + tmp_dir: Path, + start_command: str, + logger: AbstractLogger, + name: str | None = None, + keep_vm_state: bool = False, + callbacks: list[Callable] | None = None, + ) -> None: + self.start_command = QemuStartCommand(start_command) + super().__init__( + out_dir=out_dir, + name=name or self.start_command.machine_name, + logger=logger, + callbacks=callbacks, + tmp_dir=tmp_dir, + keep_vm_state=keep_vm_state, + ) + + self.full_console_log = [] + + # set up directories + self.monitor_path = self.state_dir / "monitor" + self.qmp_path = self.state_dir / "qmp" + 
self.shell_path = self.state_dir / "shell" + + self.process = None + self.pid = None + self.monitor = None + self.qmp_client = None + self.shell = None + self.serial_thread = None + + self.booted = False + self.connected = False + + def ssh_backdoor_command(self, index: int) -> str: + return f"ssh -o User=root vsock/{index}" + + def is_up(self) -> bool: + return self.booted and self.connected + + def wait_for_monitor_prompt(self) -> str: + assert self.monitor is not None + answer = "" + while True: + undecoded_answer = self.monitor.recv(1024) + if not undecoded_answer: + break + answer += undecoded_answer.decode() + if answer.endswith("(qemu) "): + break + return answer + + def send_monitor_command(self, command: str) -> str: + """ + Send a command to the QEMU monitor. This allows attaching + virtual USB disks to a running machine, among other things. + """ + self.run_callbacks() + message = f"{command}\n".encode() + assert self.monitor is not None + self.monitor.send(message) + return self.wait_for_monitor_prompt() + def _next_newline_closed_block_from_shell(self) -> str: assert self.shell output_buffer = [] @@ -478,7 +812,7 @@ class Machine: break return "".join(output_buffer) - def execute( + def _execute( self, command: str, check_return: bool = True, @@ -517,7 +851,6 @@ class Machine: `timeout` parameter, e.g., `execute(cmd, timeout=10)` or `execute(cmd, timeout=None)`. The default is 900 seconds. """ - self.run_callbacks() self.connect() # Always run command with shell opts @@ -598,75 +931,6 @@ class Machine: break self.send_console(char.decode()) - def succeed(self, *commands: str, timeout: int | None = None) -> str: - """ - Execute a shell command, raising an exception if the exit status is - not zero, otherwise returning the standard output. Similar to `execute`, - except that the timeout is `None` by default. See `execute` for details on - command execution. 
- """ - output = "" - for command in commands: - with self.nested(f"must succeed: {command}"): - (status, out) = self.execute(command, timeout=timeout) - if status != 0: - self.log(f"output: {out}") - raise RequestedAssertionFailed( - f"command `{command}` failed (exit code {status})" - ) - output += out - return output - - def fail(self, *commands: str, timeout: int | None = None) -> str: - """ - Like `succeed`, but raising an exception if the command returns a zero - status. - """ - output = "" - for command in commands: - with self.nested(f"must fail: {command}"): - (status, out) = self.execute(command, timeout=timeout) - if status == 0: - raise RequestedAssertionFailed( - f"command `{command}` unexpectedly succeeded" - ) - output += out - return output - - def wait_until_succeeds(self, command: str, timeout: int = 900) -> str: - """ - Repeat a shell command with 1-second intervals until it succeeds. - Has a default timeout of 900 seconds which can be modified, e.g. - `wait_until_succeeds(cmd, timeout=10)`. See `execute` for details on - command execution. - Throws an exception on timeout. - """ - output = "" - - def check_success(_last_try: bool) -> bool: - nonlocal output - status, output = self.execute(command, timeout=timeout) - return status == 0 - - with self.nested(f"waiting for success: {command}"): - retry(check_success, timeout) - return output - - def wait_until_fails(self, command: str, timeout: int = 900) -> str: - """ - Like `wait_until_succeeds`, but repeating the command until it fails. - """ - output = "" - - def check_failure(_last_try: bool) -> bool: - nonlocal output - status, output = self.execute(command, timeout=timeout) - return status != 0 - - with self.nested(f"waiting for failure: {command}"): - retry(check_failure, timeout) - return output - def wait_for_shutdown(self) -> None: """ Wait for the VM to power off. 
This does *not* initiate a shutdown; @@ -710,33 +974,6 @@ class Machine: if elapsed >= timeout: raise TimeoutError - def get_tty_text(self, tty: str) -> str: - """ - Get the output printed to a given TTY. - """ - status, output = self.execute( - f"fold -w$(stty -F /dev/tty{tty} size | awk '{{print $2}}') /dev/vcs{tty}" - ) - return output - - def wait_until_tty_matches(self, tty: str, regexp: str, timeout: int = 900) -> None: - """Wait until the visible output on the chosen TTY matches regular - expression. Throws an exception on timeout. - """ - matcher = re.compile(regexp) - - def tty_matches(last_try: bool) -> bool: - text = self.get_tty_text(tty) - if last_try: - self.log( - f"Last chance to match /{regexp}/ on TTY{tty}, " - f"which currently contains: {text}" - ) - return len(matcher.findall(text)) > 0 - - with self.nested(f"waiting for {regexp} to appear on tty {tty}"): - retry(tty_matches, timeout) - def send_chars(self, chars: str, delay: float | None = 0.01) -> None: r""" Simulate typing a sequence of characters on the virtual keyboard, @@ -759,70 +996,6 @@ class Machine: with self.nested(f"waiting for file '{filename}'"): retry(check_file, timeout) - def wait_for_open_port( - self, port: int, addr: str = "localhost", timeout: int = 900 - ) -> None: - """ - Wait until a process is listening on the given TCP port and IP address - (default `localhost`). - """ - - def port_is_open(_last_try: bool) -> bool: - status, _ = self.execute(f"nc -z {addr} {port}") - return status == 0 - - with self.nested(f"waiting for TCP port {port} on {addr}"): - retry(port_is_open, timeout) - - def wait_for_open_unix_socket( - self, addr: str, is_datagram: bool = False, timeout: int = 900 - ) -> None: - """ - Wait until a process is listening on the given UNIX-domain socket - (default to a UNIX-domain stream socket). 
- """ - - nc_flags = [ - "-z", - "-uU" if is_datagram else "-U", - ] - - def socket_is_open(_last_try: bool) -> bool: - status, _ = self.execute(f"nc {' '.join(nc_flags)} {addr}") - return status == 0 - - with self.nested( - f"waiting for UNIX-domain {'datagram' if is_datagram else 'stream'} on '{addr}'" - ): - retry(socket_is_open, timeout) - - def wait_for_closed_port( - self, port: int, addr: str = "localhost", timeout: int = 900 - ) -> None: - """ - Wait until nobody is listening on the given TCP port and IP address - (default `localhost`). - """ - - def port_is_closed(_last_try: bool) -> bool: - status, _ = self.execute(f"nc -z {addr} {port}") - return status != 0 - - with self.nested(f"waiting for TCP port {port} on {addr} to be closed"): - retry(port_is_closed, timeout) - - def start_job(self, jobname: str, user: str | None = None) -> tuple[int, str]: - """ - Start systemd service. - """ - return self.systemctl(f"start {jobname}", user) - - def stop_job(self, jobname: str, user: str | None = None) -> tuple[int, str]: - """ - Stop systemd service. - """ - return self.systemctl(f"stop {jobname}", user) - def connect(self) -> None: """ Wait for a connection to the guest root shell @@ -902,78 +1075,6 @@ class Machine: f"Cannot convert screenshot (pnmtopng returned code {ret.returncode})" ) - def copy_from_host_via_shell(self, source: str, target: str) -> None: - """Copy a file from the host into the guest by piping it over the - shell into the destination file. Works without host-guest shared folder. - Prefer copy_from_host for whenever possible. - """ - with open(source, "rb") as fh: - content_b64 = base64.b64encode(fh.read()).decode() - self.succeed( - f"mkdir -p $(dirname {target})", - f"echo -n {content_b64} | base64 -d > {target}", - ) - - def copy_from_host(self, source: str, target: str) -> None: - """ - Copies a file from host to machine, e.g., - `copy_from_host("myfile", "/etc/my/important/file")`. - - The first argument is the file on the host. 
Note that the "host" refers - to the environment in which the test driver runs, which is typically the - Nix build sandbox. - - The second argument is the location of the file on the machine that will - be written to. - - The file is copied via the `shared_dir` directory which is shared among - all the VMs (using a temporary directory). - The access rights bits will mimic the ones from the host file and - user:group will be root:root. - """ - host_src = Path(source) - vm_target = Path(target) - with tempfile.TemporaryDirectory(dir=self.shared_dir) as shared_td: - shared_temp = Path(shared_td) - host_intermediate = shared_temp / host_src.name - vm_shared_temp = Path("/tmp/shared") / shared_temp.name - vm_intermediate = vm_shared_temp / host_src.name - - self.succeed(make_command(["mkdir", "-p", vm_shared_temp])) - if host_src.is_dir(): - shutil.copytree(host_src, host_intermediate) - else: - shutil.copy(host_src, host_intermediate) - self.succeed(make_command(["mkdir", "-p", vm_target.parent])) - self.succeed(make_command(["cp", "-r", vm_intermediate, vm_target])) - - def copy_from_vm(self, source: str, target_dir: str = "") -> None: - """Copy a file from the VM (specified by an in-VM source path) to a path - relative to `$out`. The file is copied via the `shared_dir` shared among - all the VMs (using a temporary directory). 
- """ - # Compute the source, target, and intermediate shared file names - vm_src = Path(source) - with tempfile.TemporaryDirectory(dir=self.shared_dir) as shared_td: - shared_temp = Path(shared_td) - vm_shared_temp = Path("/tmp/shared") / shared_temp.name - vm_intermediate = vm_shared_temp / vm_src.name - intermediate = shared_temp / vm_src.name - # Copy the file to the shared directory inside VM - self.succeed(make_command(["mkdir", "-p", vm_shared_temp])) - self.succeed(make_command(["cp", "-r", vm_src, vm_intermediate])) - abs_target = self.out_dir / target_dir / vm_src.name - abs_target.parent.mkdir(exist_ok=True, parents=True) - # Copy the file from the shared directory outside VM - if intermediate.is_dir(): - shutil.copytree(intermediate, abs_target) - else: - shutil.copy(intermediate, abs_target) - - def dump_tty_contents(self, tty: str) -> None: - """Debugging: Dump the contents of the TTY""" - self.execute(f"fold -w 80 /dev/vcs{tty} | systemd-cat") - def get_screen_text_variants(self) -> list[str]: """ Return a list of different interpretations of what is currently @@ -1154,11 +1255,6 @@ class Machine: self.log(f"QEMU running (pid {self.pid})") - def cleanup_statedir(self) -> None: - shutil.rmtree(self.state_dir) - self.logger.log(f"deleting VM state directory {self.state_dir}") - self.logger.log("if you want to keep the VM state, pass --keep-vm-state") - def shutdown(self) -> None: """ Shut down the machine, waiting for the VM to exit. @@ -1234,10 +1330,6 @@ class Machine: with self.nested("waiting for a window to appear"): retry(window_is_visible, timeout) - def sleep(self, secs: int) -> None: - # We want to sleep in *guest* time, not *host* time. - self.succeed(f"sleep {secs}") - def forward_port(self, host_port: int = 8080, guest_port: int = 80) -> None: """ Forward a TCP port on the host to a TCP port on the guest. 
@@ -1264,7 +1356,7 @@ class Machine: def release(self) -> None: if self.pid is None: return - self.logger.info(f"kill machine (pid {self.pid})") + self.logger.info(f"kill QemuMachine (pid {self.pid})") assert self.process assert self.shell assert self.monitor @@ -1278,10 +1370,6 @@ class Machine: if self.qmp_client: self.qmp_client.close() - def run_callbacks(self) -> None: - for callback in self.callbacks: - callback() - def switch_root(self) -> None: """ Transition from stage 1 to stage 2. This requires the diff --git a/nixos/lib/test-script-prepend.py b/nixos/lib/test-script-prepend.py index 6be20270c6cc..84a689f93d16 100644 --- a/nixos/lib/test-script-prepend.py +++ b/nixos/lib/test-script-prepend.py @@ -4,7 +4,7 @@ from test_driver.debug import DebugAbstract from test_driver.driver import Driver from test_driver.vlan import VLan -from test_driver.machine import Machine +from test_driver.machine import BaseMachine, QemuMachine from test_driver.logger import AbstractLogger from typing import Callable, Iterator, ContextManager, Optional, List, Dict, Any, Union from typing_extensions import Protocol @@ -35,7 +35,7 @@ class CreateMachineProtocol(Protocol): *, name: Optional[str] = None, keep_vm_state: bool = False, - ) -> Machine: + ) -> BaseMachine: raise Exception("This is just type information for the Nix test driver") @@ -43,7 +43,7 @@ start_all: Callable[[], None] subtest: Callable[[str], ContextManager[None]] retry: RetryProtocol test_script: Callable[[], None] -machines: List[Machine] +machines: List[BaseMachine] vlans: List[VLan] driver: Driver log: AbstractLogger diff --git a/nixos/lib/testing/driver.nix b/nixos/lib/testing/driver.nix index 5845ebe2695a..5623c63c4dc5 100644 --- a/nixos/lib/testing/driver.nix +++ b/nixos/lib/testing/driver.nix @@ -21,12 +21,6 @@ let ) (lib.attrValues config.nodes); vms = map (m: m.system.build.vm) (lib.attrValues config.nodes); - nodeHostNames = - let - nodesList = map (c: c.system.name) (lib.attrValues config.nodes); - in 
- nodesList ++ lib.optional (lib.length nodesList == 1 && !lib.elem "machine" nodesList) "machine"; - pythonizeName = name: let @@ -38,8 +32,17 @@ let uniqueVlans = lib.unique (builtins.concatLists vlans); vlanNames = map (i: "vlan${toString i}: VLan;") uniqueVlans; - pythonizedNames = map pythonizeName nodeHostNames; - machineNames = map (name: "${name}: Machine;") pythonizedNames; + + vmMachineNames = map (c: c.system.name) (lib.attrValues config.nodes); + + theOnlyMachine = + let + exactlyOneMachine = lib.length (lib.attrValues config.nodes) == 1; + in + lib.optional (exactlyOneMachine && !lib.elem "machine" vmMachineNames) "machine"; + + pythonizedVmNames = map pythonizeName (vmMachineNames ++ theOnlyMachine); + vmMachineTypeHints = map (name: "${name}: QemuMachine;") pythonizedVmNames; withChecks = lib.warnIf config.skipLint "Linting is disabled"; @@ -62,12 +65,13 @@ let '' mkdir -p $out/bin - vmStartScripts=($(for i in ${toString vms}; do echo $i/bin/run-*-vm; done)) + vmNames=(${lib.escapeShellArgs vmMachineNames}) + vmStartScripts=(${lib.escapeShellArgs (map lib.getExe vms)}) ${lib.optionalString (!config.skipTypeCheck) '' # prepend type hints so the test script can be type checked with mypy cat "${../test-script-prepend.py}" >> testScriptWithTypes - echo "${toString machineNames}" >> testScriptWithTypes + echo "${toString vmMachineTypeHints}" >> testScriptWithTypes echo "${toString vlanNames}" >> testScriptWithTypes echo -n "$testScript" >> testScriptWithTypes @@ -90,7 +94,7 @@ let echo "See https://nixos.org/manual/nixos/stable/#test-opt-skipLint" PYFLAKES_BUILTINS="$( - echo -n ${lib.escapeShellArg (lib.concatStringsSep "," pythonizedNames)}, + echo -n ${lib.escapeShellArg (lib.concatStringsSep "," pythonizedVmNames)}, cat ${lib.escapeShellArg "driver-symbols"} )" ${hostPkgs.python3Packages.pyflakes}/bin/pyflakes $out/test-script ''} @@ -98,7 +102,8 @@ let # set defaults through environment # see: ./test-driver/test-driver.py argparse implementation 
wrapProgram $out/bin/nixos-test-driver \ - --set startScripts "''${vmStartScripts[*]}" \ + --set vmStartScripts "''${vmStartScripts[*]}" \ + --set vmNames "''${vmNames[*]}" \ --set testScript "$out/test-script" \ --set globalTimeout "${toString config.globalTimeout}" \ --set vlans '${toString vlans}' \ From 23f1e6370d1edf6b3b44bbe5f244b878c02c64b1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kier=C3=A1n=20Meinhardt?= Date: Thu, 8 Jan 2026 12:22:07 +0100 Subject: [PATCH 05/37] nixos/test-driver: add support for nspawn containers Co-authored-by: Jeremy Fleischman --- nixos/lib/test-driver/default.nix | 4 + .../test-driver/src/test_driver/__init__.py | 24 +++ .../lib/test-driver/src/test_driver/driver.py | 90 +++++++- .../src/test_driver/machine/__init__.py | 195 ++++++++++++++++-- nixos/lib/test-script-prepend.py | 2 +- nixos/lib/testing-python.nix | 1 + nixos/lib/testing/driver.nix | 28 ++- nixos/lib/testing/network.nix | 87 +++++--- nixos/lib/testing/nixos-test-base.nix | 5 +- nixos/lib/testing/nodes.nix | 153 ++++++++++---- nixos/lib/testing/run.nix | 3 + nixos/lib/testing/testScript.nix | 3 +- .../modules/testing/test-instrumentation.nix | 18 +- 13 files changed, 508 insertions(+), 105 deletions(-) diff --git a/nixos/lib/test-driver/default.nix b/nixos/lib/test-driver/default.nix index ed8eb2c8c771..13158d65de6b 100644 --- a/nixos/lib/test-driver/default.nix +++ b/nixos/lib/test-driver/default.nix @@ -19,7 +19,9 @@ qemu_test, setuptools, socat, + systemd, tesseract4, + util-linux, vde2, enableOCR ? 
false, @@ -51,7 +53,9 @@ buildPythonApplication { netpbm qemu_pkg socat + util-linux vde2 + systemd ] ++ lib.optionals enableOCR [ imagemagick_light diff --git a/nixos/lib/test-driver/src/test_driver/__init__.py b/nixos/lib/test-driver/src/test_driver/__init__.py index 35c5e3b11fb6..9c465af778c7 100755 --- a/nixos/lib/test-driver/src/test_driver/__init__.py +++ b/nixos/lib/test-driver/src/test_driver/__init__.py @@ -86,6 +86,22 @@ def main() -> None: nargs="*", help="start scripts for participating virtual machines", ) + arg_parser.add_argument( + "--container-names", + metavar="CONTAINER-NAME", + action=EnvDefault, + envvar="containerNames", + nargs="*", + help="names of participating containers", + ) + arg_parser.add_argument( + "--container-start-scripts", + metavar="CONTAINER-START-SCRIPT", + action=EnvDefault, + envvar="containerStartScripts", + nargs="*", + help="start scripts for participating containers", + ) arg_parser.add_argument( "--vlans", metavar="VLAN", @@ -150,10 +166,16 @@ def main() -> None: assert len(args.vm_names) == len(args.vm_start_scripts), ( f"the number of vm names and vm start scripts must be the same: {args.vm_names} vs. {args.vm_start_scripts}" ) + if args.container_names is not None and args.container_start_scripts is not None: + assert len(args.container_names) == len(args.container_start_scripts), ( + f"the number of container names and container start scripts must be the same: {args.container_names} vs. 
{args.container_start_scripts}" + ) with Driver( vm_names=args.vm_names, vm_start_scripts=args.vm_start_scripts or [], + container_names=args.container_names, + container_start_scripts=args.container_start_scripts or [], vlans=args.vlans, tests=args.testscript.read_text(), out_dir=output_directory, @@ -187,6 +209,8 @@ def generate_driver_symbols() -> None: d = Driver( vm_names=[], vm_start_scripts=[], + container_names=[], + container_start_scripts=[], vlans=[], tests="", out_dir=Path(), diff --git a/nixos/lib/test-driver/src/test_driver/driver.py b/nixos/lib/test-driver/src/test_driver/driver.py index c8694d2ea09a..cc04d6b5b707 100644 --- a/nixos/lib/test-driver/src/test_driver/driver.py +++ b/nixos/lib/test-driver/src/test_driver/driver.py @@ -1,6 +1,7 @@ import os import re import signal +import subprocess import sys import tempfile import threading @@ -16,7 +17,12 @@ from colorama import Style from test_driver.debug import DebugAbstract, DebugNop from test_driver.errors import MachineError, RequestedAssertionFailed from test_driver.logger import AbstractLogger -from test_driver.machine import BaseMachine, QemuMachine, retry +from test_driver.machine import ( + BaseMachine, + NspawnMachine, + QemuMachine, + retry, +) from test_driver.polling_condition import PollingCondition from test_driver.vlan import VLan @@ -64,6 +70,7 @@ class Driver: tests: str vlans: list[VLan] vm_machines: list[QemuMachine] + container_machines: list[NspawnMachine] polling_conditions: list[PollingCondition] global_timeout: int race_timer: threading.Timer @@ -74,6 +81,8 @@ class Driver: self, vm_names: list[str] | None, vm_start_scripts: list[str], + container_names: list[str] | None, + container_start_scripts: list[str], vlans: list[int], tests: str, out_dir: Path, @@ -112,10 +121,75 @@ class Driver: ) ] + if len(container_start_scripts) > 0: + self._init_nspawn_environment() + + self.container_machines = [ + NspawnMachine( + name=name, + start_command=container_start_script, + 
tmp_dir=tmp_dir, + logger=self.logger, + keep_vm_state=keep_vm_state, + callbacks=[self.check_polling_conditions], + out_dir=self.out_dir, + ) + for name, container_start_script in zip( + container_names or (len(container_start_scripts) * [None]), + container_start_scripts, + ) + ] + + def _init_nspawn_environment(self) -> None: + assert os.geteuid() == 0, ( + f"systemd-nspawn requires root to work. You are {os.geteuid()}" + ) + + # set up prerequisites for systemd-nspawn containers. + # these are not guaranteed to be set up in the Nix sandbox. + # if running interactively as root, these will already be set up. + + # check if /run is writable by root + if not os.access("/run", os.W_OK): + Path("/run").mkdir(parents=True, exist_ok=True) + subprocess.run(["mount", "-t", "tmpfs", "none", "/run"], check=True) + Path("/run/netns").mkdir(parents=True, exist_ok=True) + + # check if /var/run is a symlink to /run + if not (os.path.exists("/var/run") and os.path.samefile("/var/run", "/run")): + Path("/var").mkdir(parents=True, exist_ok=True) + subprocess.run(["ln", "-s", "/run", "/var/run"], check=True) + + # check if /sys/fs/cgroup is mounted as cgroup2 + with open("/proc/mounts", encoding="utf-8") as mounts: + for line in mounts: + parts = line.split() + if len(parts) >= 3 and parts[1] == "/sys/fs/cgroup": + if parts[2] == "cgroup2": + break + else: + Path("/sys/fs/cgroup").mkdir(parents=True, exist_ok=True) + subprocess.run( + ["mount", "-t", "cgroup2", "none", "/sys/fs/cgroup"], check=True + ) + + # ensure /etc/os-release exists + if not os.path.isfile("/etc/os-release"): + subprocess.run(["touch", "/etc/os-release"], check=True) + + # ensure /etc/machine-id exists and is non-empty + if ( + not os.path.isfile("/etc/machine-id") + or os.path.getsize("/etc/machine-id") == 0 + ): + subprocess.run( + ["systemd-machine-id-setup"], check=True + ) # set up /etc/machine-id + @property - def machines(self) -> list[QemuMachine]: - machines = self.vm_machines - # Sort the machines 
by name for consistency with `nodes` in . + def machines(self) -> list[QemuMachine | NspawnMachine]: + machines = self.vm_machines + self.container_machines + # Sort the machines by name for consistency with `nodesAndContainers` in . machines.sort(key=lambda machine: machine.name) return machines @@ -155,6 +229,7 @@ class Driver: start_all=self.start_all, test_script=self.test_script, vm_machines=self.vm_machines, + container_machines=self.container_machines, vlans=self.vlans, driver=self, log=self.logger, @@ -286,13 +361,16 @@ class Driver: *, name: str | None = None, keep_vm_state: bool = False, - ) -> QemuMachine: + ) -> BaseMachine: + """ + Create a `QemuMachine`. This currently only supports qemu "nodes", not containers. + """ tmp_dir = get_tmp_dir() return QemuMachine( - start_command=start_command, tmp_dir=tmp_dir, out_dir=self.out_dir, + start_command=start_command, name=name, keep_vm_state=keep_vm_state, logger=self.logger, diff --git a/nixos/lib/test-driver/src/test_driver/machine/__init__.py b/nixos/lib/test-driver/src/test_driver/machine/__init__.py index fecdeda493b5..32d5bc79166e 100644 --- a/nixos/lib/test-driver/src/test_driver/machine/__init__.py +++ b/nixos/lib/test-driver/src/test_driver/machine/__init__.py @@ -1,5 +1,6 @@ import base64 import io +import json import os import platform import queue @@ -16,6 +17,7 @@ import time from abc import ABC, abstractmethod from collections.abc import Callable, Generator from contextlib import _GeneratorContextManager, contextmanager, nullcontext +from functools import cached_property from pathlib import Path from queue import Queue from typing import Any @@ -218,7 +220,6 @@ class BaseMachine(ABC): name: str callbacks: list[Callable] tmp_dir: Path - keep_vm_state: bool def __repr__(self) -> str: @@ -239,7 +240,7 @@ class BaseMachine(ABC): self.callbacks = callbacks if callbacks is not None else [] self.tmp_dir = tmp_dir - # Note: "vm" is a bit of a misnomer here. 
+ # Note: "vm" is a bit of a misnomer here as we support both QEMU vms and nspawn containers. # Consider renaming to something more generic ("machine"?) self.keep_vm_state = keep_vm_state @@ -269,26 +270,13 @@ class BaseMachine(ABC): return self.logger.nested(msg, my_attrs) @abstractmethod - def is_up(self) -> bool: - """ - Check whether the machine is running. - """ - pass + def is_up(self) -> bool: ... @abstractmethod - def start(self) -> None: - """ - Start the machine. - """ - pass + def start(self) -> None: ... @abstractmethod - def wait_for_shutdown(self) -> None: - """ - Wait for the machine to power off. This does *not* initiate a shutdown; - that's usually done via `shutdown()`. - """ - pass + def wait_for_shutdown(self) -> None: ... def systemctl(self, q: str, user: str | None = None) -> tuple[int, str]: """ @@ -1384,3 +1372,174 @@ class QemuMachine(BaseMachine): ) self.connected = False self.connect() + + +class NspawnMachine(BaseMachine): + """ + A handle to a systemd-nspawn container machine with this name, that also + knows how to manage the machine lifecycle with the help of a start script / command. + """ + + start_command: str + tmp_dir: Path + process: subprocess.Popen | None + pid: int | None + + @staticmethod + def machine_name_from_start_command(start_command: str) -> str: + match = re.search("run-(.+)-nspawn", os.path.basename(start_command)) + assert match is not None, f"Could not extract node name from {start_command}" + return match.group(1) + + def __init__( + self, + out_dir: Path, + name: str | None, + start_command: str, + tmp_dir: Path, + logger: AbstractLogger, + callbacks: list[Callable] | None = None, + keep_vm_state: bool = False, + ): + # TODO: don't compute `name` from `start_command` path, instead thread it down explicitly. + # See analogous TODO in `QemuStartCommand::machine_name`. 
+ super().__init__( + out_dir=out_dir, + name=name or self.machine_name_from_start_command(start_command), + logger=logger, + callbacks=callbacks, + tmp_dir=tmp_dir, + keep_vm_state=keep_vm_state, + ) + + self.start_command = start_command + self.process = None + self.pid = None + + def ssh_backdoor_command(self, index: int) -> str: + # get IP from `ip addr` inside the container: + ip_status, ip_output = self._execute("ip -j addr show") + assert ip_status == 0, "Failed to get IP addresses from container" + ip_output_data = json.loads(ip_output) + ip_addresses = [ + addr_info.get("local") + for iface in ip_output_data + if iface.get("ifname") != "lo" + for addr_info in iface.get("addr_info", []) + if addr_info.get("family") == "inet" + ] + + return "\n".join(f"ssh -o User=root {addr}" for addr in ip_addresses) + + def release(self) -> None: + if self.pid is None: + return + + self.logger.info(f"kill NspawnMachine (pid {self.pid})") + assert self.process is not None + self.process.terminate() + self.process = None + + def is_up(self) -> bool: + return self.process is not None + + @cached_property + def get_systemd_process(self) -> int: + assert self.process is not None, "Machine not started" + assert self.process.stdout is not None, "Machine has no stdout" + + systemd_nspawn_pid = None + for line_bytes in self.process.stdout: + line = line_bytes.decode() + print(line, end="") + + systemd_nspawn_pid_prefix = "systemd-nspawn's PID is " + if line.startswith(systemd_nspawn_pid_prefix): + systemd_nspawn_pid = int(line.removeprefix(systemd_nspawn_pid_prefix)) + + if ( + line.startswith("systemd[1]: Startup finished in") + or "Welcome to NixOS" in line + ): + assert systemd_nspawn_pid is not None, "Must find systemd-nspawn PID" + break + else: + raise RuntimeError(f"Failed to start container {self.name}") + + childs = ( + Path(f"/proc/{systemd_nspawn_pid}/task/{systemd_nspawn_pid}/children") + .read_text() + .split() + ) + assert len(childs) == 1, ( + f"Expected exactly one 
child process for systemd-nspawn, got {childs}" + ) + (child,) = childs + + try: + return int(child) + except ValueError as e: + raise RuntimeError(f"Failed to parse child process id {child}") from e + + def _execute( + self, + command: str, + check_return: bool = True, + check_output: bool = True, + timeout: int | None = 900, + ) -> tuple[int, str]: + self.start() + + container_pid = self.get_systemd_process + nsenter = shutil.which("nsenter") + assert nsenter is not None + + # Pull in /etc/profile, and some shell sanity. + command = f"set -eo pipefail; source /etc/profile; set -xu; {command}" + cp = subprocess.run( + [ + nsenter, + "--target", + str(container_pid), + "--mount", + "--uts", + "--ipc", + "--net", + "--pid", + "--cgroup", + "/bin/sh", + "-c", + command, + ], + env={}, + timeout=timeout, + stdout=subprocess.PIPE, + text=True, + ) + return (cp.returncode, cp.stdout) + + def start(self) -> None: + if self.process is not None: + return + + self.process = subprocess.Popen( + [self.start_command], + env={ + "RUN_NSPAWN_ROOT_DIR": str(self.state_dir), + "RUN_NSPAWN_SHARED_DIR": str(self.shared_dir), + }, + stdin=subprocess.PIPE, + stdout=subprocess.PIPE, + ) + + self.pid = self.process.pid + + self.log(f"system-nspawn running (pid {self.pid})") + + def wait_for_shutdown(self) -> None: + if self.process is None: + return + + with self.nested("waiting for the container to power off"): + self.process.wait() + self.process = None diff --git a/nixos/lib/test-script-prepend.py b/nixos/lib/test-script-prepend.py index 84a689f93d16..e06a210d6dd7 100644 --- a/nixos/lib/test-script-prepend.py +++ b/nixos/lib/test-script-prepend.py @@ -4,7 +4,7 @@ from test_driver.debug import DebugAbstract from test_driver.driver import Driver from test_driver.vlan import VLan -from test_driver.machine import BaseMachine, QemuMachine +from test_driver.machine import BaseMachine, NspawnMachine, QemuMachine from test_driver.logger import AbstractLogger from typing import Callable, 
Iterator, ContextManager, Optional, List, Dict, Any, Union from typing_extensions import Protocol diff --git a/nixos/lib/testing-python.nix b/nixos/lib/testing-python.nix index 878f9669321a..1d2777ecb033 100644 --- a/nixos/lib/testing-python.nix +++ b/nixos/lib/testing-python.nix @@ -56,6 +56,7 @@ pkgs.lib.throwIf (args ? specialArgs) { machine ? null, nodes ? { }, + containers ? { }, testScript, enableOCR ? false, globalTimeout ? (60 * 60), diff --git a/nixos/lib/testing/driver.nix b/nixos/lib/testing/driver.nix index 5623c63c4dc5..db56f55307c8 100644 --- a/nixos/lib/testing/driver.nix +++ b/nixos/lib/testing/driver.nix @@ -14,12 +14,15 @@ let qemu_pkg = config.qemu.package; imagemagick_light = hostPkgs.imagemagick_light.override { inherit (hostPkgs) libtiff; }; tesseract4 = hostPkgs.tesseract4.override { enableLanguages = [ "eng" ]; }; + # We want `pkgs.systemd`, *not* `python3Packages.systemd`. + systemd = hostPkgs.systemd; }; vlans = map ( m: (m.virtualisation.vlans ++ (lib.mapAttrsToList (_: v: v.vlan) m.virtualisation.interfaces)) - ) (lib.attrValues config.nodes); + ) ((lib.attrValues config.nodes) ++ (lib.attrValues config.containers)); vms = map (m: m.system.build.vm) (lib.attrValues config.nodes); + containers = map (m: m.system.build.nspawn) (lib.attrValues config.containers); pythonizeName = name: @@ -34,16 +37,28 @@ let vlanNames = map (i: "vlan${toString i}: VLan;") uniqueVlans; vmMachineNames = map (c: c.system.name) (lib.attrValues config.nodes); + containerMachineNames = map (c: c.system.name) (lib.attrValues config.containers); + allMachineNames = + let + overlappingNames = lib.intersectLists vmMachineNames containerMachineNames; + in + assert ( + lib.asserts.assertMsg (overlappingNames == [ ]) "vm names and container names must not overlap" + ); + vmMachineNames ++ containerMachineNames; theOnlyMachine = let exactlyOneMachine = lib.length (lib.attrValues config.nodes) == 1; in - lib.optional (exactlyOneMachine && !lib.elem "machine"
vmMachineNames) "machine"; + lib.optional (exactlyOneMachine && !lib.elem "machine" allMachineNames) "machine"; pythonizedVmNames = map pythonizeName (vmMachineNames ++ theOnlyMachine); vmMachineTypeHints = map (name: "${name}: QemuMachine;") pythonizedVmNames; + pythonizedContainerNames = map pythonizeName containerMachineNames; + containerMachineTypeHints = map (name: "${name}: NspawnMachine;") pythonizedContainerNames; + withChecks = lib.warnIf config.skipLint "Linting is disabled"; driver = @@ -67,11 +82,14 @@ let vmNames=(${lib.escapeShellArgs vmMachineNames}) vmStartScripts=(${lib.escapeShellArgs (map lib.getExe vms)}) + containerNames=(${lib.escapeShellArgs containerMachineNames}) + containerStartScripts=(${lib.escapeShellArgs (map lib.getExe containers)}) ${lib.optionalString (!config.skipTypeCheck) '' # prepend type hints so the test script can be type checked with mypy cat "${../test-script-prepend.py}" >> testScriptWithTypes echo "${toString vmMachineTypeHints}" >> testScriptWithTypes + echo "${toString containerMachineTypeHints}" >> testScriptWithTypes echo "${toString vlanNames}" >> testScriptWithTypes echo -n "$testScript" >> testScriptWithTypes @@ -94,7 +112,9 @@ let echo "See https://nixos.org/manual/nixos/stable/#test-opt-skipLint" PYFLAKES_BUILTINS="$( - echo -n ${lib.escapeShellArg (lib.concatStringsSep "," pythonizedVmNames)}, + echo -n ${ + lib.escapeShellArg (lib.concatStringsSep "," (pythonizedVmNames ++ pythonizedContainerNames)) + }, cat ${lib.escapeShellArg "driver-symbols"} )" ${hostPkgs.python3Packages.pyflakes}/bin/pyflakes $out/test-script ''} @@ -104,6 +124,8 @@ let wrapProgram $out/bin/nixos-test-driver \ --set vmStartScripts "''${vmStartScripts[*]}" \ --set vmNames "''${vmNames[*]}" \ + --set containerStartScripts "''${containerStartScripts[*]}" \ + --set containerNames "''${containerNames[*]}" \ --set testScript "$out/test-script" \ --set globalTimeout "${toString config.globalTimeout}" \ --set vlans '${toString vlans}' \ diff 
--git a/nixos/lib/testing/network.nix b/nixos/lib/testing/network.nix index 9a5facfc2433..99edf16676ae 100644 --- a/nixos/lib/testing/network.nix +++ b/nixos/lib/testing/network.nix @@ -1,11 +1,15 @@ -{ lib, nodes, ... }: +{ + containers, + nodes, + lib, + ... +}: let inherit (lib) attrNames - concatMap + concatMapAttrsStringSep concatMapStrings - flip forEach head listToAttrs @@ -20,22 +24,22 @@ let zipLists ; - nodeNumbers = listToAttrs (zipListsWith nameValuePair (attrNames nodes) (range 1 254)); + nodesAndContainers = + let + nodeNames = lib.attrNames nodes; + containerNames = lib.attrNames containers; + conflictingNames = lib.intersectLists nodeNames containerNames; + message = "`nodes` and `containers` must have unique names. Conflicting names: ${lib.concatStringsSep " " conflictingNames}"; + in + lib.throwIfNot (builtins.length conflictingNames == 0) message (nodes // containers); + + nodeNumbers = listToAttrs (zipListsWith nameValuePair (attrNames nodesAndContainers) (range 1 254)); networkModule = - { - config, - nodes, - pkgs, - ... - }: + { config, ... }: let - qemu-common = import ../qemu-common.nix { inherit (pkgs) lib stdenv; }; - interfaces = lib.attrValues config.virtualisation.allInterfaces; - interfacesNumbered = zipLists interfaces (range 1 255); - # Automatically assign IP addresses to requested interfaces. assignIPs = lib.filter (i: i.assignIP) interfaces; ipInterfaces = forEach assignIPs ( @@ -56,17 +60,6 @@ let } ); - qemuOptions = lib.flatten ( - forEach interfacesNumbered ( - { fst, snd }: qemu-common.qemuNICFlags snd fst.vlan config.virtualisation.test.nodeNumber - ) - ); - udevRules = forEach interfaces ( - interface: - # MAC Addresses for QEMU network devices are lowercase, and udev string comparison is case-sensitive. 
- ''SUBSYSTEM=="net",ACTION=="add",ATTR{address}=="${toLower (qemu-common.qemuNicMac interface.vlan config.virtualisation.test.nodeNumber)}",NAME="${interface.name}"'' - ); - networkConfig = { networking.hostName = mkDefault config.virtualisation.test.nodeName; @@ -85,10 +78,9 @@ let # interfaces, use the IP address corresponding to # the first interface (i.e. the first network in its # virtualisation.vlans option). - networking.extraHosts = flip concatMapStrings (attrNames nodes) ( - m': + networking.extraHosts = concatMapAttrsStringSep "" ( + m': config: let - config = nodes.${m'}; hostnames = optionalString ( config.networking.domain != null @@ -101,10 +93,7 @@ let + optionalString ( config.networking.primaryIPv6Address != "" ) "${config.networking.primaryIPv6Address} ${hostnames}" - ); - - virtualisation.qemu.options = qemuOptions; - boot.initrd.services.udev.rules = concatMapStrings (x: x + "\n") udevRules; + ) nodesAndContainers; }; in @@ -117,6 +106,31 @@ let }; }; + qemuNetworkModule = + { config, pkgs, ... }: + let + qemu-common = import ../qemu-common.nix { inherit (pkgs) lib stdenv; }; + + interfaces = lib.attrValues config.virtualisation.allInterfaces; + + interfacesNumbered = zipLists interfaces (range 1 255); + + qemuOptions = lib.flatten ( + forEach interfacesNumbered ( + { fst, snd }: qemu-common.qemuNICFlags snd fst.vlan config.virtualisation.test.nodeNumber + ) + ); + udevRules = map ( + interface: + # MAC Addresses for QEMU network devices are lowercase, and udev string comparison is case-sensitive. + ''SUBSYSTEM=="net",ACTION=="add",ATTR{address}=="${toLower (qemu-common.qemuNicMac interface.vlan config.virtualisation.test.nodeNumber)}",NAME="${interface.name}"'' + ) interfaces; + in + { + virtualisation.qemu.options = qemuOptions; + boot.initrd.services.udev.rules = concatMapStrings (x: x + "\n") udevRules; + }; + nodeNumberModule = ( regular@{ config, name, ... 
}: { @@ -127,7 +141,7 @@ let # We need to force this in specialisations, otherwise it'd be # readOnly = true; description = '' - The `name` in `nodes.<name>`; stable across `specialisations`. + The `name` in `nodes.<name>` and `containers.<name>`; stable across `specialisations`. ''; }; virtualisation.test.nodeNumber = mkOption { @@ -136,7 +150,7 @@ let readOnly = true; default = nodeNumbers.${config.virtualisation.test.nodeName}; description = '' - A unique number assigned for each node in `nodes`. + A unique number assigned for each machine in `nodes` and `containers`. ''; }; @@ -172,5 +186,10 @@ in nodeNumberModule ]; }; + extraBaseNodeModules = { + imports = [ + qemuNetworkModule + ]; + }; }; } diff --git a/nixos/lib/testing/nixos-test-base.nix b/nixos/lib/testing/nixos-test-base.nix index 23358f2185b1..6b518e39ac11 100644 --- a/nixos/lib/testing/nixos-test-base.nix +++ b/nixos/lib/testing/nixos-test-base.nix @@ -7,7 +7,6 @@ let in { imports = [ - ../../modules/virtualisation/qemu-vm.nix ../../modules/testing/test-instrumentation.nix # !!! should only get added for automated test runs { key = "no-manual"; @@ -32,7 +31,9 @@ in # This is mostly a Hydra optimization, so we don't rebuild all the tests every time switch-to-configuration-ng changes. key = "no-switch-to-configuration"; system.switch.enable = mkDefault ( - config.isSpecialisation || config.specialisation != { } || config.virtualisation.installBootLoader + config.isSpecialisation + || config.specialisation != { } + || (!config.boot.isContainer && config.virtualisation.installBootLoader) ); } ) diff --git a/nixos/lib/testing/nodes.nix b/nixos/lib/testing/nodes.nix index 721a3c88b369..b2ee6420d186 100644 --- a/nixos/lib/testing/nodes.nix +++ b/nixos/lib/testing/nodes.nix @@ -2,7 +2,6 @@ testModuleArgs@{ config, lib, hostPkgs, - nodes, options, ... 
}: @@ -12,12 +11,9 @@ let literalExpression literalMD mapAttrs - mkDefault mkIf mkMerge mkOption - mkForce - optional optionalAttrs types ; @@ -51,13 +47,6 @@ let key = "nodes"; _module.args.nodes = config.nodesCompat; } - ( - { config, ... }: - { - virtualisation.qemu.package = testModuleArgs.config.qemu.package; - virtualisation.host.pkgs = hostPkgs; - } - ) ( { options, ... }: { @@ -73,6 +62,44 @@ let testModuleArgs.config.extraBaseModules ]; }; + baseQemuOS = baseOS.extendModules { + modules = [ + ../../modules/virtualisation/qemu-vm.nix + config.nodeDefaults + { + key = "base-qemu"; + virtualisation.qemu.package = testModuleArgs.config.qemu.package; + virtualisation.host.pkgs = hostPkgs; + } + testModuleArgs.config.extraBaseNodeModules + ]; + }; + baseNspawnOS = baseOS.extendModules { + modules = [ + ../../modules/virtualisation/nspawn-container + config.containerDefaults + ( + { pkgs, ... }: + { + key = "base-nspawn"; + + # PAM requires setuid and doesn't work in the build sandbox. + # https://github.com/NixOS/nix/blob/959c244a1265f4048390f3ad21679219d7b27a99/src/libstore/unix/build/linux-derivation-builder.cc#L63 + services.openssh.settings.UsePAM = false; + + # Gross, insecure hack to make login work. See above. 
+ security.pam.services.login = { + text = '' + auth sufficient ${pkgs.linux-pam}/lib/security/pam_permit.so + account sufficient ${pkgs.linux-pam}/lib/security/pam_permit.so + password sufficient ${pkgs.linux-pam}/lib/security/pam_permit.so + session sufficient ${pkgs.linux-pam}/lib/security/pam_permit.so + ''; + }; + } + ) + ]; + }; # TODO (lib): Dedup with run.nix, add to lib/options.nix mkOneUp = opt: f: lib.mkOverride (opt.highestPrio - 1) (f opt.value); @@ -109,15 +136,37 @@ in node.type = mkOption { type = types.raw; - default = baseOS.type; + default = baseQemuOS.type; internal = true; }; nodes = mkOption { type = types.lazyAttrsOf config.node.type; + default = { }; visible = "shallow"; description = '' - An attribute set of NixOS configuration modules. + An attribute set of NixOS configuration modules representing QEMU vms that can be started during a test. + + The configurations are augmented by the [`defaults`](#test-opt-defaults) option. + + They are assigned network addresses according to the `nixos/lib/testing/network.nix` module. + + A few special options are available, that aren't in a plain NixOS configuration. See [Configuring the nodes](#sec-nixos-test-nodes) + ''; + }; + + container.type = mkOption { + type = types.raw; + default = baseNspawnOS.type; + internal = true; + }; + + containers = mkOption { + type = types.lazyAttrsOf config.container.type; + default = { }; + visible = "shallow"; + description = '' + An attribute set of NixOS configuration modules representing systemd-nspawn containers that can be started during a test. The configurations are augmented by the [`defaults`](#test-opt-defaults) option. @@ -128,6 +177,14 @@ in }; defaults = mkOption { + description = '' + NixOS configuration that is applied to all [{option}`nodes`](#test-opt-nodes) and [{option}`containers`](#test-opt-containers). 
+ ''; + type = types.deferredModule; + default = { }; + }; + + nodeDefaults = mkOption { description = '' NixOS configuration that is applied to all [{option}`nodes`](#test-opt-nodes). ''; @@ -135,7 +192,23 @@ in default = { }; }; + containerDefaults = mkOption { + description = '' + NixOS configuration that is applied to all [{option}`containers`](#test-opt-containers). + ''; + type = types.deferredModule; + default = { }; + }; + extraBaseModules = mkOption { + description = '' + NixOS configuration that, like [{option}`defaults`](#test-opt-defaults), is applied to all [{option}`nodes`](#test-opt-nodes) and [{option}`containers`](#test-opt-containers) and can not be undone with [`specialisation.<name>.inheritParentConfig`](https://search.nixos.org/options?show=specialisation.%3Cname%3E.inheritParentConfig&from=0&size=50&sort=relevance&type=packages&query=specialisation). + ''; + type = types.deferredModule; + default = { }; + }; + + extraBaseNodeModules = mkOption { description = '' NixOS configuration that, like [{option}`defaults`](#test-opt-defaults), is applied to all [{option}`nodes`](#test-opt-nodes) and can not be undone with [`specialisation.<name>.inheritParentConfig`](https://search.nixos.org/options?show=specialisation.%3Cname%3E.inheritParentConfig&from=0&size=50&sort=relevance&type=packages&query=specialisation). ''; @@ -145,7 +218,7 @@ in node.pkgs = mkOption { description = '' - The Nixpkgs to use for the nodes. + The Nixpkgs to use for the nodes and containers. Setting this will make the `nixpkgs.*` options read-only, to avoid mistakenly testing with a Nixpkgs configuration that diverges from regular use. ''; @@ -160,7 +233,7 @@ in description = '' Whether to make the `nixpkgs.*` options read-only. This is only relevant when [`node.pkgs`](#test-opt-node.pkgs) is set. - Set this to `false` when any of the [`nodes`](#test-opt-nodes) needs to configure any of the `nixpkgs.*` options. This will slow down evaluation of your test a bit. 
+ Set this to `false` when any of the [`nodes`](#test-opt-nodes) or [{option}`containers`](#test-opt-containers) need to configure any of the `nixpkgs.*` options. This will slow down evaluation of your test a bit. ''; type = types.bool; default = config.node.pkgs != null; @@ -188,6 +261,7 @@ in }; config = { + _module.args.containers = config.containers; _module.args.nodes = config.nodesCompat; nodesCompat = mapAttrs ( name: config: @@ -201,6 +275,7 @@ in ) config.nodes; passthru.nodes = config.nodesCompat; + passthru.containers = config.containers; extraDriverArgs = mkIf config.sshBackdoor.enable [ "--dump-vsocks=${toString config.sshBackdoor.vsockOffset}" @@ -211,33 +286,35 @@ in nixpkgs.pkgs = config.node.pkgs; imports = [ ../../modules/misc/nixpkgs/read-only.nix ]; }) - (mkIf config.sshBackdoor.enable ( - let - inherit (config.sshBackdoor) vsockOffset; - in - { config, ... }: - { - services.openssh = { - enable = true; - settings = { - PermitRootLogin = "yes"; - PermitEmptyPasswords = "yes"; - }; + (mkIf config.sshBackdoor.enable { + services.openssh = { + enable = true; + settings = { + PermitRootLogin = "yes"; + PermitEmptyPasswords = "yes"; }; + }; - security.pam.services.sshd = { - allowNullPassword = true; - }; - - virtualisation.qemu.options = [ - "-device vhost-vsock-pci,guest-cid=${ - toString (config.virtualisation.test.nodeNumber + vsockOffset) - }" - ]; - } - )) + security.pam.services.sshd = { + allowNullPassword = true; + }; + }) ]; + nodeDefaults = mkIf config.sshBackdoor.enable ( + let + inherit (config.sshBackdoor) vsockOffset; + in + { config, ... 
}: + { + virtualisation.qemu.options = [ + "-device vhost-vsock-pci,guest-cid=${ + toString (config.virtualisation.test.nodeNumber + vsockOffset) + }" + ]; + } + ); + # Docs: nixos/doc/manual/development/writing-nixos-tests.section.md /** See https://nixos.org/manual/nixos/unstable#sec-override-nixos-test diff --git a/nixos/lib/testing/run.nix b/nixos/lib/testing/run.nix index e34e585241be..0eefb1b0e38d 100644 --- a/nixos/lib/testing/run.nix +++ b/nixos/lib/testing/run.nix @@ -2,6 +2,7 @@ config, hostPkgs, lib, + containers, options, ... }: @@ -96,6 +97,8 @@ in requiredSystemFeatures = [ "nixos-test" ] + # Containers use systemd-nspawn, which requires uid 0 inside of the sandbox. + ++ lib.optional (builtins.length (lib.attrNames containers) > 0) "uid-range" ++ lib.optional isLinux "kvm" ++ lib.optional isDarwin "apple-virt"; diff --git a/nixos/lib/testing/testScript.nix b/nixos/lib/testing/testScript.nix index bde7b78607b4..4ce368c0b8db 100644 --- a/nixos/lib/testing/testScript.nix +++ b/nixos/lib/testing/testScript.nix @@ -56,11 +56,12 @@ in # reuse memoized config v ) config.nodesCompat; + containers = config.containers; } else config.testScript; - defaults = + nodeDefaults = { config, name, ... }: { # Make sure all derivations referenced by the test diff --git a/nixos/modules/testing/test-instrumentation.nix b/nixos/modules/testing/test-instrumentation.nix index dce59a0a93b7..81cb578bf017 100644 --- a/nixos/modules/testing/test-instrumentation.nix +++ b/nixos/modules/testing/test-instrumentation.nix @@ -86,7 +86,8 @@ in options.testing = { backdoor = lib.mkEnableOption "backdoor service in stage 2" // { - default = true; + # See assertion below for why the backdoor doesn't work with containers. + default = !config.boot.isContainer; }; initrdBackdoor = lib.mkEnableOption '' @@ -105,7 +106,20 @@ in { assertion = cfg.initrdBackdoor -> config.boot.initrd.systemd.enable; message = '' - testing.initrdBackdoor requires boot.initrd.systemd.enable to be enabled. 
+ `testing.initrdBackdoor` requires `boot.initrd.systemd.enable` to be enabled. + ''; + } + { + assertion = config.boot.isContainer -> !cfg.backdoor; + message = '' + `testing.backdoor` uses virtio console, which does not work with + containers (we use `nsenter` instead). + ''; + } + { + assertion = config.boot.isContainer -> !cfg.initrdBackdoor; + message = '' + `testing.initrdBackdoor` does not work with containers as there is no initrd. ''; } ]; From 1bbcf46376e44727de14d235136db84fe9e8967f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kier=C3=A1n=20Meinhardt?= Date: Thu, 8 Jan 2026 12:24:28 +0100 Subject: [PATCH 06/37] nixos/tests: add nspawn container bittorrent integration test --- nixos/tests/all-tests.nix | 1 + nixos/tests/test-containers-bittorrent.nix | 215 +++++++++++++++++++++ 2 files changed, 216 insertions(+) create mode 100644 nixos/tests/test-containers-bittorrent.nix diff --git a/nixos/tests/all-tests.nix b/nixos/tests/all-tests.nix index 39542f1e683a..99ddaeea55d0 100644 --- a/nixos/tests/all-tests.nix +++ b/nixos/tests/all-tests.nix @@ -1592,6 +1592,7 @@ in teleports = runTest ./teleports.nix; temporal = runTest ./temporal.nix; terminal-emulators = handleTest ./terminal-emulators.nix { }; + test-containers-bittorrent = runTest ./test-containers-bittorrent.nix; thanos = runTest ./thanos.nix; thelounge = handleTest ./thelounge.nix { }; tiddlywiki = runTest ./tiddlywiki.nix; diff --git a/nixos/tests/test-containers-bittorrent.nix b/nixos/tests/test-containers-bittorrent.nix new file mode 100644 index 000000000000..aeac191cdc2e --- /dev/null +++ b/nixos/tests/test-containers-bittorrent.nix @@ -0,0 +1,215 @@ +# This test runs a Bittorrent tracker on one machine, and verifies +# that two client machines can download the torrent using +# `aria2c'. The first client (behind a NAT router) downloads +# from the initial seeder running on the tracker. Then we kill the +# initial seeder. 
The second client downloads from the first client, +# which only works if the first client successfully uses the UPnP-IGD +# protocol to poke a hole in the NAT. + +# We use aria2 as the initial seeder because transmission +# fails in the sandbox because of systemd hardening settings, +# namely MountAPIVFS=yes, so we get the following error: + +# $ journalctl --unit transmission.service +# (n-daemon)[417]: transmission.service: Failed to create destination mount point node '/run/transmission/run/host/.os-release-stage/', ignoring: Read-only file system +# (n-daemon)[417]: transmission.service: Failed to mount /run/systemd/propagate/.os-release-stage to /run/transmission/run/host/.os-release-stage/: No such file or directory +# (n-daemon)[417]: transmission.service: Failed to set up mount namespacing: /run/host/.os-release-stage/: No such file or directory +# (n-daemon)[417]: transmission.service: Failed at step NAMESPACE spawning /nix/store/zfksw9bllp95pl45d1nxmpd2lks42bkj-transmission-4.0.6/bin/transmission-daemon: No such file or directory +# systemd[1]: transmission.service: Main process exited, code=exited, status=226/NAMESPACE + +{ lib, hostPkgs, ... }: + +let + + # Some random file to serve. + file = hostPkgs.hello.src; + + internalRouterAddress = "192.168.3.1"; + internalClient1Address = "192.168.3.2"; + + # cannot use documentation networks (198.51.100.0/24 or 192.0.2.0/24) here + # because miniupnpd recognizes them as such and refuses to work with them + # https://github.com/miniupnp/miniupnp/blob/2a74cb2f27cacf06d2b50c187e8f90aa1f5c2528/miniupnpd/miniupnpd.c#L998 + externalRouterAddress = "80.100.100.1"; + externalClient2Address = "80.100.100.2"; + externalTrackerAddress = "80.100.100.3"; + + download-dir = "/tmp/aria2-downloads"; + peerConfig = + { pkgs, ... 
}: + { + environment.systemPackages = [ + pkgs.aria2 + pkgs.transmission_4 # only needed for transmission-create + ]; + }; +in + +{ + name = "bittorrent"; + meta = { + maintainers = [ + lib.maintainers.kmein + ]; + }; + + containers = { + tracker = + { pkgs, ... }: + { + imports = [ peerConfig ]; + + virtualisation.vlans = [ 1 ]; + networking.firewall.enable = false; + networking.interfaces.eth1.ipv4.addresses = [ + { + address = externalTrackerAddress; + prefixLength = 24; + } + ]; + + # We need Apache on the tracker to serve the torrents. + services.httpd = { + enable = true; + virtualHosts = { + "torrentserver.org" = { + adminAddr = "foo@example.org"; + documentRoot = "/tmp"; + }; + }; + }; + services.opentracker.enable = true; + }; + + router = + { pkgs, containers, ... }: + { + virtualisation.vlans = [ + 1 + 2 + ]; + networking.nat.enable = true; + networking.nat.internalInterfaces = [ "eth2" ]; + networking.nat.externalInterface = "eth1"; + networking.firewall.enable = true; + networking.firewall.trustedInterfaces = [ "eth2" ]; + networking.interfaces.eth0.ipv4.addresses = [ ]; + networking.interfaces.eth1.ipv4.addresses = [ + { + address = externalRouterAddress; + prefixLength = 24; + } + ]; + networking.interfaces.eth2.ipv4.addresses = [ + { + address = internalRouterAddress; + prefixLength = 24; + } + ]; + networking.nftables.enable = true; + services.miniupnpd = { + enable = true; + externalInterface = "eth1"; + internalIPs = [ "eth2" ]; + appendConfig = '' + ext_ip=${externalRouterAddress} + ''; + }; + }; + + client1 = + { pkgs, containers, ... }: + { + imports = [ peerConfig ]; + environment.systemPackages = [ pkgs.miniupnpc ]; + + virtualisation.vlans = [ 2 ]; + networking.interfaces.eth0.ipv4.addresses = [ ]; + networking.interfaces.eth1.ipv4.addresses = [ + { + address = internalClient1Address; + prefixLength = 24; + } + ]; + networking.defaultGateway = internalRouterAddress; + networking.firewall.enable = false; + }; + + client2 = + { pkgs, ... 
}: + { + imports = [ peerConfig ]; + + virtualisation.vlans = [ 1 ]; + networking.interfaces.eth0.ipv4.addresses = [ ]; + networking.interfaces.eth1.ipv4.addresses = [ + { + address = externalClient2Address; + prefixLength = 24; + } + ]; + networking.firewall.enable = false; + }; + }; + + testScript = + { containers, ... }: + '' + start_all() + + # Wait for network and miniupnpd. + router.systemctl("start network-online.target") + router.wait_for_unit("network-online.target") + router.wait_for_unit("miniupnpd") + + # Create the torrent. + tracker.succeed("mkdir -p ${download-dir}") + tracker.succeed( + "cp ${file} ${download-dir}/test.tar.bz2" + ) + tracker.succeed( + "transmission-create ${download-dir}/test.tar.bz2 --private --tracker http://${externalTrackerAddress}:6969/announce --outfile /tmp/test.torrent" + ) + tracker.succeed("chmod 644 /tmp/test.torrent") + + # Start the tracker + tracker.systemctl("start network-online.target") + tracker.wait_for_unit("network-online.target") + tracker.wait_for_unit("opentracker.service") + tracker.wait_for_open_port(6969) + + # --- Start the initial seeder using aria2 --- + # https://stackoverflow.com/a/44528978 + tracker.execute( + "aria2c --enable-dht=false --seed-time=999 --dir=${download-dir} " + "-V --seed-ratio=0.0 " + "/tmp/test.torrent >/dev/null &" + ) + + # --- Wait until the tracker shows we are seeding --- + tracker.wait_until_succeeds("curl -s http://localhost:6969/stats | grep -q 'serving 1 torrents'") + + # Now we should be able to download from the client behind the NAT. 
+ tracker.wait_for_unit("httpd") + + def connect_from(machine): + machine.systemctl("start network-online.target") + machine.wait_for_unit("network-online.target") + machine.execute( + "aria2c --enable-dht=false --seed-time=999 --dir=${download-dir} " + "http://${externalTrackerAddress}/test.torrent >/dev/null &" + ) + machine.wait_until_succeeds( + "cmp ${download-dir}/test.tar.bz2 ${file}" + ) # Wait for download to finish and verify + + connect_from(client1) + + # --- Bring down the initial seeder --- + tracker.succeed("pkill aria2c") + + # Now download from the second client. This can only succeed if + # the first client created a NAT hole in the router. + connect_from(client2) + ''; +} From 930f45eb5a22ddfdf0566a05e53a40f532572326 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kier=C3=A1n=20Meinhardt?= Date: Thu, 8 Jan 2026 12:25:00 +0100 Subject: [PATCH 07/37] nixos/tests: add nspawn container integration test Co-authored-by: Jeremy Fleischman --- nixos/tests/all-tests.nix | 1 + nixos/tests/test-containers.nix | 76 +++++++++++++++++++++++++++++++++ 2 files changed, 77 insertions(+) create mode 100644 nixos/tests/test-containers.nix diff --git a/nixos/tests/all-tests.nix b/nixos/tests/all-tests.nix index 99ddaeea55d0..15bb03eaf6b5 100644 --- a/nixos/tests/all-tests.nix +++ b/nixos/tests/all-tests.nix @@ -1592,6 +1592,7 @@ in teleports = runTest ./teleports.nix; temporal = runTest ./temporal.nix; terminal-emulators = handleTest ./terminal-emulators.nix { }; + test-containers = runTest ./test-containers.nix; test-containers-bittorrent = runTest ./test-containers-bittorrent.nix; thanos = runTest ./thanos.nix; thelounge = handleTest ./thelounge.nix { }; diff --git a/nixos/tests/test-containers.nix b/nixos/tests/test-containers.nix new file mode 100644 index 000000000000..b5db58658891 --- /dev/null +++ b/nixos/tests/test-containers.nix @@ -0,0 +1,76 @@ +{ pkgs, ... 
}: +{ + name = "test-containers"; + meta.maintainers = with pkgs.lib.maintainers; [ jfly ]; + + nodes = { + n1 = { + networking.firewall.enable = false; + virtualisation.vlans = [ 1 ]; + }; + n2 = { + networking.firewall.enable = false; + virtualisation.vlans = [ + 2 + ]; + }; + }; + + containers = { + c1 = { + networking.firewall.enable = false; + virtualisation.vlans = [ 1 ]; + }; + c2 = { + networking.firewall.enable = false; + virtualisation.vlans = [ 2 ]; + }; + c12 = { + networking.firewall.enable = false; + virtualisation.vlans = [ + 1 + 2 + ]; + }; + }; + + testScript = /* python */ '' + c1.start() + c2.start() + c12.start() + + c1.succeed("echo hello > /hello.txt") + c1.copy_from_vm("/hello.txt") + + c1.systemctl("start network-online.target") + c2.systemctl("start network-online.target") + c12.systemctl("start network-online.target") + c1.wait_for_unit("network-online.target") + c2.wait_for_unit("network-online.target") + c12.wait_for_unit("network-online.target") + + # Confirm containers in vlan 1 can talk to each other. + c1.succeed("ping -c 1 c12") + c12.succeed("ping -c 1 c1") + + # Confirm containers in vlan 2 can talk to each other. + # <<< c2.succeed("ping -c 1 c12") # <<< TODO: this doesn't work because c12's "primary ip" is for vlan 1 + c12.succeed("ping -c 1 c2") + + # Confirm containers in separate vlans cannot talk to each other. + c1.fail("ping -c 1 -W 1 c2") + + n1.start() + n2.start() + n1.systemctl("start network-online.target") + n2.systemctl("start network-online.target") + n1.wait_for_unit("network-online.target") + n2.wait_for_unit("network-online.target") + + # <<< # Confirm containers and nodes in the same vlan can talk to each other. 
+ # <<< c1.succeed("ping -c 1 n1") + # <<< n1.succeed("ping -c 1 c1") + # <<< c2.succeed("ping -c 1 n2") + # <<< n2.succeed("ping -c 1 c2") + ''; +} From 7c821987452e839db817fab5e5613c73b8c7e5bd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kier=C3=A1n=20Meinhardt?= Date: Mon, 12 Jan 2026 09:58:35 +0100 Subject: [PATCH 08/37] nixos/test-driver: fix grammar and typos --- .../test-driver/src/test_driver/machine/__init__.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/nixos/lib/test-driver/src/test_driver/machine/__init__.py b/nixos/lib/test-driver/src/test_driver/machine/__init__.py index 32d5bc79166e..ba32a853e58c 100644 --- a/nixos/lib/test-driver/src/test_driver/machine/__init__.py +++ b/nixos/lib/test-driver/src/test_driver/machine/__init__.py @@ -133,7 +133,7 @@ class QemuStartCommand: """A start script from nixos/modules/virtualiation/qemu-vm.nix. These Nix commands have the particular characteristic that the machine name can be extracted out of them via a regex match. - (Admittedly a _very_ implicit contract, evtl. TODO fix) + (Admittedly a _very_ implicit contract, TODO fix this eventually.) 
""" match = re.search("run-(.+)-vm$", self._cmd) name = "machine" @@ -1466,15 +1466,15 @@ class NspawnMachine(BaseMachine): else: raise RuntimeError(f"Failed to start container {self.name}") - childs = ( + children = ( Path(f"/proc/{systemd_nspawn_pid}/task/{systemd_nspawn_pid}/children") .read_text() .split() ) - assert len(childs) == 1, ( - f"Expected exactly one child process for systemd-nspawn, got {childs}" + assert len(children) == 1, ( + f"Expected exactly one child process for systemd-nspawn, got {children}" ) - (child,) = childs + (child,) = children try: return int(child) @@ -1534,7 +1534,7 @@ class NspawnMachine(BaseMachine): self.pid = self.process.pid - self.log(f"system-nspawn running (pid {self.pid})") + self.log(f"systemd-nspawn running (pid {self.pid})") def wait_for_shutdown(self) -> None: if self.process is None: From 0702405810774722a4cb70ec55ad9a6336438ec4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kier=C3=A1n=20Meinhardt?= Date: Mon, 12 Jan 2026 14:13:13 +0100 Subject: [PATCH 09/37] =?UTF-8?q?nixos/test-driver:=20implement=20containe?= =?UTF-8?q?r=E2=86=94VM=E2=80=AFnetworking?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- nixos/lib/test-driver/src/test_driver/vlan.py | 13 +++++++++++++ .../run-nspawn/src/run_nspawn/__init__.py | 11 +++++++++++ nixos/tests/test-containers.nix | 16 +++++++++++----- 3 files changed, 35 insertions(+), 5 deletions(-) diff --git a/nixos/lib/test-driver/src/test_driver/vlan.py b/nixos/lib/test-driver/src/test_driver/vlan.py index 89ca33165b4d..eed66dcd13cb 100644 --- a/nixos/lib/test-driver/src/test_driver/vlan.py +++ b/nixos/lib/test-driver/src/test_driver/vlan.py @@ -50,6 +50,8 @@ class VLan: pid: int fd: io.TextIOBase + plug_process: subprocess.Popen + logger: AbstractLogger def __repr__(self) -> str: @@ -58,6 +60,7 @@ class VLan: def __init__(self, nr: int, tmp_dir: Path, logger: AbstractLogger): self.nr = nr self.socket_dir = tmp_dir / f"vde{self.nr}.ctl" + 
self.tap_name = f"vde-tap{self.nr}" self.logger = logger # TODO: don't side-effect environment here @@ -114,6 +117,13 @@ class VLan: if "1000 Success" in line: break + # This is needed to allow systemd-nspawn containers to communicate + # with VMs connected to the VLAN. + self.logger.info(f"creating tap interface {self.tap_name}") + self.plug_process = subprocess.Popen( + ["vde_plug2tap", "-s", self.socket_dir, self.tap_name], + ) + assert (self.socket_dir / "ctl").exists(), "cannot start vde_switch" self.logger.info(f"running vlan (pid {self.pid}; ctl {self.socket_dir})") @@ -122,4 +132,7 @@ class VLan: self.logger.info(f"kill vlan (pid {self.pid})") assert self.process.stdin is not None self.process.stdin.close() + if self.plug_process: + self.plug_process.terminate() + self.plug_process.wait() self.process.terminate() diff --git a/nixos/modules/virtualisation/nspawn-container/run-nspawn/src/run_nspawn/__init__.py b/nixos/modules/virtualisation/nspawn-container/run-nspawn/src/run_nspawn/__init__.py index d6fe8f4958fc..ed292912d16d 100644 --- a/nixos/modules/virtualisation/nspawn-container/run-nspawn/src/run_nspawn/__init__.py +++ b/nixos/modules/virtualisation/nspawn-container/run-nspawn/src/run_nspawn/__init__.py @@ -68,7 +68,9 @@ def ensure_vlan_bridge(vlan: int) -> typing.Generator[str, None, None]: ipv6_addr = f"2001:db8:{vlan}::fe/64" bridge_name = f"br{vlan}" + tap_name = f"vde-tap{vlan}" bridge_path = Path("/sys/class/net") / bridge_name + tap_path = Path("/sys/class/net") / tap_name try: # To avoid racing against other nspawn containers that also # need this vlan, grab an exclusive lock. 
@@ -80,6 +82,15 @@ def ensure_vlan_bridge(vlan: int) -> typing.Generator[str, None, None]: run_ip("addr", "add", ipv4_addr, "dev", bridge_name) run_ip("addr", "add", ipv6_addr, "dev", bridge_name) + if tap_path.exists(): + logger.info(f"attaching {tap_name} to {bridge_name}") + run_ip("link", "set", tap_name, "master", bridge_name) + run_ip("link", "set", tap_name, "up") + else: + logger.warning( + f"TAP {tap_name} not found; container will be isolated from VDE" + ) + yield bridge_name finally: # To avoid racing against other nspawn containers that also diff --git a/nixos/tests/test-containers.nix b/nixos/tests/test-containers.nix index b5db58658891..94877e2d62fb 100644 --- a/nixos/tests/test-containers.nix +++ b/nixos/tests/test-containers.nix @@ -67,10 +67,16 @@ n1.wait_for_unit("network-online.target") n2.wait_for_unit("network-online.target") - # <<< # Confirm containers and nodes in the same vlan can talk to each other. - # <<< c1.succeed("ping -c 1 n1") - # <<< n1.succeed("ping -c 1 c1") - # <<< c2.succeed("ping -c 1 n2") - # <<< n2.succeed("ping -c 1 c2") + # Confirm containers and nodes in the same vlan can talk to each other. + c1.succeed("ping -c 1 n1") + n1.succeed("ping -c 1 c1") + c2.succeed("ping -c 1 n2") + n2.succeed("ping -c 1 c2") + + # Confirm containers and nodes in different vlans cannot talk to each other. 
+ c1.fail("ping -c 1 -W 1 n2") + n1.fail("ping -c 1 -W 1 c2") + c2.fail("ping -c 1 -W 1 n1") + n2.fail("ping -c 1 -W 1 c1") ''; } From d90e24b2385d61c9fe29e6586e17c8754fa95816 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kier=C3=A1n=20Meinhardt?= Date: Tue, 13 Jan 2026 16:03:59 +0100 Subject: [PATCH 10/37] nixos/test-driver: deprecate --keep-vm-state in favour of --keep-machine-state --- .../test-driver/src/test_driver/__init__.py | 26 +++++++++++++++---- .../lib/test-driver/src/test_driver/driver.py | 10 +++---- .../src/test_driver/machine/__init__.py | 25 +++++++++--------- nixos/lib/test-script-prepend.py | 3 ++- 4 files changed, 41 insertions(+), 23 deletions(-) diff --git a/nixos/lib/test-driver/src/test_driver/__init__.py b/nixos/lib/test-driver/src/test_driver/__init__.py index 9c465af778c7..166562105e99 100755 --- a/nixos/lib/test-driver/src/test_driver/__init__.py +++ b/nixos/lib/test-driver/src/test_driver/__init__.py @@ -1,6 +1,8 @@ import argparse import os +import sys import time +import warnings from pathlib import Path import ptpython.ipython @@ -55,9 +57,15 @@ def writeable_dir(arg: str) -> Path: def main() -> None: arg_parser = argparse.ArgumentParser(prog="nixos-test-driver") arg_parser.add_argument( - "-K", "--keep-vm-state", - help="re-use a VM state coming from a previous run", + help=argparse.SUPPRESS, + dest="keep_machine_state", + action="store_true", + ) + arg_parser.add_argument( + "-K", + "--keep-machine-state", + help="re-use a machine state coming from a previous run", action="store_true", ) arg_parser.add_argument( @@ -146,6 +154,12 @@ def main() -> None: args = arg_parser.parse_args() + if "--keep-vm-state" in sys.argv: + warnings.warn( + "The flag '--keep-vm-state' is deprecated. 
Use '--keep-machine-state' instead.", + DeprecationWarning, + ) + output_directory = args.output_directory.resolve() logger = CompositeLogger([TerminalLogger()]) @@ -155,8 +169,10 @@ def main() -> None: if args.junit_xml: logger.add_logger(JunitXMLLogger(output_directory / args.junit_xml)) - if not args.keep_vm_state: - logger.info("Machine state will be reset. To keep it, pass --keep-vm-state") + if not args.keep_machine_state: + logger.info( + "Machine state will be reset. To keep it, pass --keep-machine-state" + ) debugger: DebugAbstract = DebugNop() if args.debug_hook_attach is not None: @@ -180,7 +196,7 @@ def main() -> None: tests=args.testscript.read_text(), out_dir=output_directory, logger=logger, - keep_vm_state=args.keep_vm_state, + keep_machine_state=args.keep_machine_state, global_timeout=args.global_timeout, debug=debugger, ) as driver: diff --git a/nixos/lib/test-driver/src/test_driver/driver.py b/nixos/lib/test-driver/src/test_driver/driver.py index cc04d6b5b707..0e665d608700 100644 --- a/nixos/lib/test-driver/src/test_driver/driver.py +++ b/nixos/lib/test-driver/src/test_driver/driver.py @@ -87,7 +87,7 @@ class Driver: tests: str, out_dir: Path, logger: AbstractLogger, - keep_vm_state: bool = False, + keep_machine_state: bool = False, global_timeout: int = 24 * 60 * 60 * 7, debug: DebugAbstract = DebugNop(), ): @@ -110,7 +110,7 @@ class Driver: QemuMachine( name=name, start_command=vm_start_script, - keep_vm_state=keep_vm_state, + keep_machine_state=keep_machine_state, tmp_dir=tmp_dir, callbacks=[self.check_polling_conditions], out_dir=self.out_dir, @@ -130,7 +130,7 @@ class Driver: start_command=container_start_script, tmp_dir=tmp_dir, logger=self.logger, - keep_vm_state=keep_vm_state, + keep_machine_state=keep_machine_state, callbacks=[self.check_polling_conditions], out_dir=self.out_dir, ) @@ -360,7 +360,7 @@ class Driver: start_command: str, *, name: str | None = None, - keep_vm_state: bool = False, + keep_machine_state: bool = False, ) -> 
BaseMachine: """ Create a `QemuMachine`. This currently only supports qemu "nodes", not containers. @@ -372,7 +372,7 @@ class Driver: out_dir=self.out_dir, start_command=start_command, name=name, - keep_vm_state=keep_vm_state, + keep_machine_state=keep_machine_state, logger=self.logger, ) diff --git a/nixos/lib/test-driver/src/test_driver/machine/__init__.py b/nixos/lib/test-driver/src/test_driver/machine/__init__.py index ba32a853e58c..94f09a8fa240 100644 --- a/nixos/lib/test-driver/src/test_driver/machine/__init__.py +++ b/nixos/lib/test-driver/src/test_driver/machine/__init__.py @@ -14,6 +14,7 @@ import sys import tempfile import threading import time +import warnings from abc import ABC, abstractmethod from collections.abc import Callable, Generator from contextlib import _GeneratorContextManager, contextmanager, nullcontext @@ -220,7 +221,7 @@ class BaseMachine(ABC): name: str callbacks: list[Callable] tmp_dir: Path - keep_vm_state: bool + keep_machine_state: bool def __repr__(self) -> str: return f"<{self.__class__.__name__} '{self.name}'>" @@ -232,7 +233,7 @@ class BaseMachine(ABC): logger: AbstractLogger, tmp_dir: Path, callbacks: list[Callable] | None, - keep_vm_state: bool, + keep_machine_state: bool, ) -> None: self.out_dir = out_dir self.name = name @@ -240,12 +241,10 @@ class BaseMachine(ABC): self.callbacks = callbacks if callbacks is not None else [] self.tmp_dir = tmp_dir - # Note: "vm" is a bit of a misnomer here as we support both QEMU vms and nspawn containers. - # Consider renaming to something more generic ("machine"?) 
- self.keep_vm_state = keep_vm_state + self.keep_machine_state = keep_machine_state self.state_dir = self.tmp_dir / f"vm-state-{self.name}" - if (not self.keep_vm_state) and self.state_dir.exists(): + if (not self.keep_machine_state) and self.state_dir.exists(): self.cleanup_statedir() self.state_dir.mkdir(mode=0o700, exist_ok=True) @@ -617,8 +616,10 @@ class BaseMachine(ABC): def cleanup_statedir(self) -> None: shutil.rmtree(self.state_dir) - self.logger.log(f"deleting VM state directory {self.state_dir}") - self.logger.log("if you want to keep the VM state, pass --keep-vm-state") + self.logger.log(f"deleting machine state directory {self.state_dir}") + self.logger.log( + "if you want to keep the machine state, pass --keep-machine-state" + ) def copy_from_vm(self, source: str, target_dir: str = "") -> None: """Copy a file from the VM (specified by an in-VM source path) to a path @@ -725,7 +726,7 @@ class QemuMachine(BaseMachine): start_command: str, logger: AbstractLogger, name: str | None = None, - keep_vm_state: bool = False, + keep_machine_state: bool = False, callbacks: list[Callable] | None = None, ) -> None: self.start_command = QemuStartCommand(start_command) @@ -735,7 +736,7 @@ class QemuMachine(BaseMachine): logger=logger, callbacks=callbacks, tmp_dir=tmp_dir, - keep_vm_state=keep_vm_state, + keep_machine_state=keep_machine_state, ) self.full_console_log = [] @@ -1399,7 +1400,7 @@ class NspawnMachine(BaseMachine): tmp_dir: Path, logger: AbstractLogger, callbacks: list[Callable] | None = None, - keep_vm_state: bool = False, + keep_machine_state: bool = False, ): # TODO: don't compute `name` from `start_command` path, instead thread it down explicitly. # See analogous TODO in `QemuStartCommand::machine_name`. 
@@ -1409,7 +1410,7 @@ class NspawnMachine(BaseMachine): logger=logger, callbacks=callbacks, tmp_dir=tmp_dir, - keep_vm_state=keep_vm_state, + keep_machine_state=keep_machine_state, ) self.start_command = start_command diff --git a/nixos/lib/test-script-prepend.py b/nixos/lib/test-script-prepend.py index e06a210d6dd7..b9a12d5f2f54 100644 --- a/nixos/lib/test-script-prepend.py +++ b/nixos/lib/test-script-prepend.py @@ -34,7 +34,8 @@ class CreateMachineProtocol(Protocol): start_command: str | dict, *, name: Optional[str] = None, - keep_vm_state: bool = False, + keep_machine_state: bool = False, + **kwargs: Any, # to allow usage of deprecated keep_vm_state ) -> BaseMachine: raise Exception("This is just type information for the Nix test driver") From 47b08df7600746d3d5150c86d42af7c158924a68 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kier=C3=A1n=20Meinhardt?= Date: Tue, 13 Jan 2026 16:04:27 +0100 Subject: [PATCH 11/37] nixos/test-driver: deprecate copy_from_vm in favour of copy_from_machine --- .../test-driver/src/test_driver/__init__.py | 4 ++-- .../src/test_driver/machine/__init__.py | 18 +++++++++++------- nixos/tests/test-containers.nix | 2 +- 3 files changed, 14 insertions(+), 10 deletions(-) diff --git a/nixos/lib/test-driver/src/test_driver/__init__.py b/nixos/lib/test-driver/src/test_driver/__init__.py index 166562105e99..733e92d894ec 100755 --- a/nixos/lib/test-driver/src/test_driver/__init__.py +++ b/nixos/lib/test-driver/src/test_driver/__init__.py @@ -129,8 +129,8 @@ def main() -> None: arg_parser.add_argument( "-o", "--output_directory", - help="""The path to the directory where outputs copied from the VM will be placed. - By e.g. Machine.copy_from_vm or Machine.screenshot""", + help="""The path to the directory where outputs copied from the machine will be placed. + By e.g. 
NspawnMachine.copy_from_machine or QemuMachine.screenshot""", default=Path.cwd(), type=writeable_dir, ) diff --git a/nixos/lib/test-driver/src/test_driver/machine/__init__.py b/nixos/lib/test-driver/src/test_driver/machine/__init__.py index 94f09a8fa240..3fe18676dabf 100644 --- a/nixos/lib/test-driver/src/test_driver/machine/__init__.py +++ b/nixos/lib/test-driver/src/test_driver/machine/__init__.py @@ -621,10 +621,10 @@ class BaseMachine(ABC): "if you want to keep the machine state, pass --keep-machine-state" ) - def copy_from_vm(self, source: str, target_dir: str = "") -> None: - """Copy a file from the VM (specified by an in-VM source path) to a path + def copy_from_machine(self, source: str, target_dir: str = "") -> None: + """Copy a file from the machine (specified by an in-machine source path) to a path relative to `$out`. The file is copied via the `shared_dir` shared among - all the VMs (using a temporary directory). + all the machines (using a temporary directory). """ # Compute the source, target, and intermediate shared file names vm_src = Path(source) @@ -633,17 +633,21 @@ class BaseMachine(ABC): vm_shared_temp = Path("/tmp/shared") / shared_temp.name vm_intermediate = vm_shared_temp / vm_src.name intermediate = shared_temp / vm_src.name - # Copy the file to the shared directory inside VM + # Copy the file to the shared directory inside machines self.succeed(make_command(["mkdir", "-p", vm_shared_temp])) self.succeed(make_command(["cp", "-r", vm_src, vm_intermediate])) abs_target = self.out_dir / target_dir / vm_src.name abs_target.parent.mkdir(exist_ok=True, parents=True) - # Copy the file from the shared directory outside VM + # Copy the file from the shared directory outside machines if intermediate.is_dir(): shutil.copytree(intermediate, abs_target) else: shutil.copy(intermediate, abs_target) + @warnings.deprecated("Use copy_from_machine() instead") + def copy_from_vm(self, source: str, target_dir: str = "") -> None: + self.copy_from_machine(source, 
target_dir) + def copy_from_host_via_shell(self, source: str, target: str) -> None: """Copy a file from the host into the guest by piping it over the shell into the destination file. Works without host-guest shared folder. @@ -669,7 +673,7 @@ class BaseMachine(ABC): be written to. The file is copied via the `shared_dir` directory which is shared among - all the VMs (using a temporary directory). + all the machines (using a temporary directory). The access rights bits will mimic the ones from the host file and user:group will be root:root. """ @@ -833,7 +837,7 @@ class QemuMachine(BaseMachine): Takes an optional parameter `check_return` that defaults to `True`. Setting this parameter to `False` will not check for the return code and return -1 instead. This can be used for commands that shut down - the VM and would therefore break the pipe that would be used for + the machine and would therefore break the pipe that would be used for retrieving the return code. A timeout for the command can be specified (in seconds) using the optional diff --git a/nixos/tests/test-containers.nix b/nixos/tests/test-containers.nix index 94877e2d62fb..5f393966f1ff 100644 --- a/nixos/tests/test-containers.nix +++ b/nixos/tests/test-containers.nix @@ -40,7 +40,7 @@ c12.start() c1.succeed("echo hello > /hello.txt") - c1.copy_from_vm("/hello.txt") + c1.copy_from_machine("/hello.txt") c1.systemctl("start network-online.target") c2.systemctl("start network-online.target") From 898fc1dfed4243abab99c3423912d23352f5f82b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kier=C3=A1n=20Meinhardt?= Date: Mon, 19 Jan 2026 08:53:07 +0100 Subject: [PATCH 12/37] nixos/tests: move container integration test to nixos-test-driver.containers --- nixos/tests/all-tests.nix | 2 +- .../{test-containers.nix => nixos-test-driver/containers.nix} | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) rename nixos/tests/{test-containers.nix => nixos-test-driver/containers.nix} (98%) diff --git a/nixos/tests/all-tests.nix 
b/nixos/tests/all-tests.nix index 15bb03eaf6b5..f96dc0fb8854 100644 --- a/nixos/tests/all-tests.nix +++ b/nixos/tests/all-tests.nix @@ -168,6 +168,7 @@ in node-name = runTest ./nixos-test-driver/node-name.nix; busybox = runTest ./nixos-test-driver/busybox.nix; console-log = runTest ./nixos-test-driver/console-log.nix; + containers = runTest ./nixos-test-driver/containers.nix; driver-timeout = pkgs.runCommand "ensure-timeout-induced-failure" { @@ -1592,7 +1593,6 @@ in teleports = runTest ./teleports.nix; temporal = runTest ./temporal.nix; terminal-emulators = handleTest ./terminal-emulators.nix { }; - test-containers = runTest ./test-containers.nix; test-containers-bittorrent = runTest ./test-containers-bittorrent.nix; thanos = runTest ./thanos.nix; thelounge = handleTest ./thelounge.nix { }; diff --git a/nixos/tests/test-containers.nix b/nixos/tests/nixos-test-driver/containers.nix similarity index 98% rename from nixos/tests/test-containers.nix rename to nixos/tests/nixos-test-driver/containers.nix index 5f393966f1ff..f80fca9f5591 100644 --- a/nixos/tests/test-containers.nix +++ b/nixos/tests/nixos-test-driver/containers.nix @@ -1,6 +1,6 @@ { pkgs, ... 
}: { - name = "test-containers"; + name = "containers"; meta.maintainers = with pkgs.lib.maintainers; [ jfly ]; nodes = { From 4cd7413051031311beb7177615eec9fdca45fd26 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kier=C3=A1n=20Meinhardt?= Date: Tue, 27 Jan 2026 16:11:17 +0100 Subject: [PATCH 13/37] nixos/test-driver: rename machine properties to machines_{qemu,nspawn} --- nixos/lib/test-driver/src/test_driver/driver.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/nixos/lib/test-driver/src/test_driver/driver.py b/nixos/lib/test-driver/src/test_driver/driver.py index 0e665d608700..8fce372eec9c 100644 --- a/nixos/lib/test-driver/src/test_driver/driver.py +++ b/nixos/lib/test-driver/src/test_driver/driver.py @@ -69,8 +69,8 @@ class Driver: tests: str vlans: list[VLan] - vm_machines: list[QemuMachine] - container_machines: list[NspawnMachine] + machines_qemu: list[QemuMachine] + machines_nspawn: list[NspawnMachine] polling_conditions: list[PollingCondition] global_timeout: int race_timer: threading.Timer @@ -106,7 +106,7 @@ class Driver: self.polling_conditions = [] - self.vm_machines = [ + self.machines_qemu = [ QemuMachine( name=name, start_command=vm_start_script, @@ -124,7 +124,7 @@ class Driver: if len(container_start_scripts) > 0: self._init_nspawn_environment() - self.container_machines = [ + self.machines_nspawn = [ NspawnMachine( name=name, start_command=container_start_script, @@ -188,7 +188,7 @@ class Driver: @property def machines(self) -> list[QemuMachine | NspawnMachine]: - machines = self.vm_machines + self.container_machines + machines = self.machines_qemu + self.machines_nspawn # Sort the machines by name for consistency with `nodesAndContainers` in . 
machines.sort(key=lambda machine: machine.name) return machines @@ -228,8 +228,8 @@ class Driver: general_symbols = dict( start_all=self.start_all, test_script=self.test_script, - vm_machines=self.vm_machines, - container_machines=self.container_machines, + machines_qemu=self.machines_qemu, + machines_nspawn=self.machines_nspawn, vlans=self.vlans, driver=self, log=self.logger, From 1dbb1741f2ac20c1a703c7db6d370d4031634160 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kier=C3=A1n=20Meinhardt?= Date: Tue, 27 Jan 2026 16:21:44 +0100 Subject: [PATCH 14/37] nixos/test-driver: improve overlapping machine name warning --- nixos/lib/testing/driver.nix | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/nixos/lib/testing/driver.nix b/nixos/lib/testing/driver.nix index db56f55307c8..8a3f40e7c44a 100644 --- a/nixos/lib/testing/driver.nix +++ b/nixos/lib/testing/driver.nix @@ -43,7 +43,8 @@ let overlappingNames = lib.intersectLists vmMachineNames containerMachineNames; in assert ( - lib.asserts.assertMsg (overlappingNames == [ ]) "vm names and container names must not overlap" + lib.asserts.assertMsg (overlappingNames == [ ]) + "Names of QEMU VM nodes and systemd-nspawn containers must not overlap. 
Overlapping names: ${toString overlappingNames}" ); vmMachineNames ++ containerMachineNames; From 8e0487c5a389a30916b777c013ca5a3df07d49d8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kier=C3=A1n=20Meinhardt?= Date: Wed, 28 Jan 2026 09:59:18 +0100 Subject: [PATCH 15/37] nixos/test-driver: use systemd container interface for SSH backdoor --- .../src/test_driver/machine/__init__.py | 18 ++++-------------- nixos/lib/testing/run.nix | 1 + 2 files changed, 5 insertions(+), 14 deletions(-) diff --git a/nixos/lib/test-driver/src/test_driver/machine/__init__.py b/nixos/lib/test-driver/src/test_driver/machine/__init__.py index 3fe18676dabf..e78d9af4567a 100644 --- a/nixos/lib/test-driver/src/test_driver/machine/__init__.py +++ b/nixos/lib/test-driver/src/test_driver/machine/__init__.py @@ -1,6 +1,5 @@ import base64 import io -import json import os import platform import queue @@ -1422,19 +1421,10 @@ class NspawnMachine(BaseMachine): self.pid = None def ssh_backdoor_command(self, index: int) -> str: - # get IP from `ip addr` inside the container: - ip_status, ip_output = self._execute("ip -j addr show") - assert ip_status == 0, "Failed to get IP addresses from container" - ip_output_data = json.loads(ip_output) - ip_addresses = [ - addr_info.get("local") - for iface in ip_output_data - if iface.get("ifname") != "lo" - for addr_info in iface.get("addr_info", []) - if addr_info.get("family") == "inet" - ] - - return "\n".join(f"ssh -o User=root {addr}" for addr in ip_addresses) + # documented in systemd-ssh-generator(8) and https://systemd.io/CONTAINER_INTERFACE/ + socket_path = f"/run/systemd/nspawn/unix-export/{self.name}/ssh" + proxy_cmd = f"socat - UNIX-CLIENT:{socket_path}" + return f'ssh -o User=root -o ProxyCommand="{proxy_cmd}" bash' def release(self) -> None: if self.pid is None: diff --git a/nixos/lib/testing/run.nix b/nixos/lib/testing/run.nix index 0eefb1b0e38d..646832f71e62 100644 --- a/nixos/lib/testing/run.nix +++ b/nixos/lib/testing/run.nix @@ -105,6 +105,7 @@ in 
nativeBuildInputs = lib.optionals config.enableDebugHook [ hostPkgs.openssh hostPkgs.inetutils + hostPkgs.socat # to allow SSH backdoor connections for systemd-nspawn containers ]; buildCommand = '' From 0d1a3536000b845bf407910083286720fb310fec Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kier=C3=A1n=20Meinhardt?= Date: Wed, 28 Jan 2026 10:44:09 +0100 Subject: [PATCH 16/37] nixos/test-driver: provide all machines from read-only allMachines option --- nixos/lib/testing/driver.nix | 10 +--------- nixos/lib/testing/network.nix | 19 +++++-------------- nixos/lib/testing/nodes.nix | 19 +++++++++++++++++++ 3 files changed, 25 insertions(+), 23 deletions(-) diff --git a/nixos/lib/testing/driver.nix b/nixos/lib/testing/driver.nix index 8a3f40e7c44a..864fc9058a95 100644 --- a/nixos/lib/testing/driver.nix +++ b/nixos/lib/testing/driver.nix @@ -38,19 +38,11 @@ let vmMachineNames = map (c: c.system.name) (lib.attrValues config.nodes); containerMachineNames = map (c: c.system.name) (lib.attrValues config.containers); - allMachineNames = - let - overlappingNames = lib.intersectLists vmMachineNames containerMachineNames; - in - assert ( - lib.asserts.assertMsg (overlappingNames == [ ]) - "Names of QEMU VM nodes and systemd-nspawn containers must not overlap. Overlapping names: ${toString overlappingNames}" - ); - vmMachineNames ++ containerMachineNames; theOnlyMachine = let exactlyOneMachine = lib.length (lib.attrValues config.nodes) == 1; + allMachineNames = map (c: c.system.name) (lib.attrValues config.allMachines); in lib.optional (exactlyOneMachine && !lib.elem "machine" allMachineNames) "machine"; diff --git a/nixos/lib/testing/network.nix b/nixos/lib/testing/network.nix index 99edf16676ae..ff3c57412546 100644 --- a/nixos/lib/testing/network.nix +++ b/nixos/lib/testing/network.nix @@ -1,6 +1,4 @@ -{ - containers, - nodes, +testModuleArgs@{ lib, ... 
}: @@ -24,16 +22,9 @@ let zipLists ; - nodesAndContainers = - let - nodeNames = lib.attrNames nodes; - containerNames = lib.attrNames containers; - conflictingNames = lib.intersectLists nodeNames containerNames; - message = "`nodes` and `containers` must have unique names. Conflicting names: ${lib.concatStringsSep " " conflictingNames}"; - in - lib.throwIfNot (builtins.length conflictingNames == 0) message (nodes // containers); - - nodeNumbers = listToAttrs (zipListsWith nameValuePair (attrNames nodesAndContainers) (range 1 254)); + nodeNumbers = listToAttrs ( + zipListsWith nameValuePair (attrNames testModuleArgs.config.allMachines) (range 1 254) + ); networkModule = { config, ... }: @@ -93,7 +84,7 @@ let + optionalString ( config.networking.primaryIPv6Address != "" ) "${config.networking.primaryIPv6Address} ${hostnames}" - ) nodesAndContainers; + ) testModuleArgs.config.allMachines; }; in diff --git a/nixos/lib/testing/nodes.nix b/nixos/lib/testing/nodes.nix index b2ee6420d186..ceec0b8fab7f 100644 --- a/nixos/lib/testing/nodes.nix +++ b/nixos/lib/testing/nodes.nix @@ -176,6 +176,25 @@ in ''; }; + allMachines = mkOption { + readOnly = true; + internal = true; + description = '' + Basically a merge of [{option}`nodes`](#test-opt-nodes) and [{option}`containers`](#test-opt-containers). + + This ensures that there are no name collisions between nodes and containers. + ''; + default = + let + overlappingNames = lib.intersectLists (lib.attrNames config.nodes) ( + lib.attrNames config.containers + ); + in + lib.throwIfNot (overlappingNames == [ ]) + "The following names are used in both `nodes` and `containers`: ${lib.concatStringsSep ", " overlappingNames}" + (config.nodes // config.containers); + }; + defaults = mkOption { description = '' NixOS configuration that is applied to all [{option}`nodes`](#test-opt-nodes) and [{option}`containers`](#test-opt-containers). 
From 884c63fca9936fe06d00237bfeb7616f868c6d86 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kier=C3=A1n=20Meinhardt?= Date: Wed, 28 Jan 2026 11:14:50 +0100 Subject: [PATCH 17/37] nixos/test-driver: remove superfluous machine methods --- .../src/test_driver/machine/__init__.py | 65 +++++++++---------- 1 file changed, 31 insertions(+), 34 deletions(-) diff --git a/nixos/lib/test-driver/src/test_driver/machine/__init__.py b/nixos/lib/test-driver/src/test_driver/machine/__init__.py index e78d9af4567a..9a80e2b88c6a 100644 --- a/nixos/lib/test-driver/src/test_driver/machine/__init__.py +++ b/nixos/lib/test-driver/src/test_driver/machine/__init__.py @@ -551,40 +551,6 @@ class BaseMachine(ABC): """ return self.systemctl(f"stop {jobname}", user) - def wait_for_job(self, jobname: str) -> None: - self.wait_for_unit(jobname) - - def get_tty_text(self, tty: str) -> str: - """ - Get the output printed to a given TTY. - """ - status, output = self.execute( - f"fold -w$(stty -F /dev/tty{tty} size | awk '{{print $2}}') /dev/vcs{tty}" - ) - return output - - def wait_until_tty_matches(self, tty: str, regexp: str, timeout: int = 900) -> None: - """Wait until the visible output on the chosen TTY matches regular - expression. Throws an exception on timeout. - """ - matcher = re.compile(regexp) - - def tty_matches(last_try: bool) -> bool: - text = self.get_tty_text(tty) - if last_try: - self.log( - f"Last chance to match /{regexp}/ on TTY{tty}, " - f"which currently contains: {text}" - ) - return len(matcher.findall(text)) > 0 - - with self.nested(f"waiting for {regexp} to appear on tty {tty}"): - retry(tty_matches, timeout) - - def dump_tty_contents(self, tty: str) -> None: - """Debugging: Dump the contents of the TTY""" - self.execute(f"fold -w 80 /dev/vcs{tty} | systemd-cat") - def execute( self, command: str, @@ -804,6 +770,37 @@ class QemuMachine(BaseMachine): break return "".join(output_buffer) + def get_tty_text(self, tty: str) -> str: + """ + Get the output printed to a given TTY. 
+ """ + status, output = self.execute( + f"fold -w$(stty -F /dev/tty{tty} size | awk '{{print $2}}') /dev/vcs{tty}" + ) + return output + + def wait_until_tty_matches(self, tty: str, regexp: str, timeout: int = 900) -> None: + """Wait until the visible output on the chosen TTY matches regular + expression. Throws an exception on timeout. + """ + matcher = re.compile(regexp) + + def tty_matches(last_try: bool) -> bool: + text = self.get_tty_text(tty) + if last_try: + self.log( + f"Last chance to match /{regexp}/ on TTY{tty}, " + f"which currently contains: {text}" + ) + return len(matcher.findall(text)) > 0 + + with self.nested(f"waiting for {regexp} to appear on tty {tty}"): + retry(tty_matches, timeout) + + def dump_tty_contents(self, tty: str) -> None: + """Debugging: Dump the contents of the TTY""" + self.execute(f"fold -w 80 /dev/vcs{tty} | systemd-cat") + def _execute( self, command: str, From a4fa7a935f321696274c5a0df4a1c76138acf6fa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kier=C3=A1n=20Meinhardt?= Date: Wed, 28 Jan 2026 15:04:34 +0100 Subject: [PATCH 18/37] nixos/test-driver: stream nspawn container journal --- .../src/test_driver/machine/__init__.py | 49 +++++++++++++++++++ 1 file changed, 49 insertions(+) diff --git a/nixos/lib/test-driver/src/test_driver/machine/__init__.py b/nixos/lib/test-driver/src/test_driver/machine/__init__.py index 9a80e2b88c6a..a78ebb721986 100644 --- a/nixos/lib/test-driver/src/test_driver/machine/__init__.py +++ b/nixos/lib/test-driver/src/test_driver/machine/__init__.py @@ -1510,6 +1510,52 @@ class NspawnMachine(BaseMachine): ) return (cp.returncode, cp.stdout) + def _stream_journal(self) -> None: + assert self.process is not None, "Container not started" + journal_path = self.state_dir / "var/log/journal" + + # 1. 
Wait for the directory to actually be created by the container + self.log(f"Waiting for journal at {journal_path}...") + max_attempts = 10 + attempts = 0 + while not journal_path.exists() and attempts < max_attempts: + time.sleep(1) + attempts += 1 + + if not journal_path.exists(): + self.log(f"Error: Journal directory {journal_path} never appeared.") + return + + # 2. Start the journalctl process + # Using a loop here handles cases where journalctl might exit unexpectedly + while self.process.poll() is None: # While the container is still running + with subprocess.Popen( + ["journalctl", "-f", "-D", journal_path, "-o", "short-monotonic"], + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + text=True, + bufsize=1, # Line buffered. + ) as log_proc: + assert log_proc.stdout is not None, ( + "Failed to capture journalctl output" + ) + try: + for line in iter(log_proc.stdout.readline, ""): + if line: + self.log_serial(line.rstrip()) + if self.process.poll() is not None: + break + except Exception as e: + self.log(f"Error while reading journalctl output: {e}") + finally: + log_proc.terminate() + log_proc.wait() + + # If we reach here, journalctl stopped while the container is still running. + # Wait a moment before retrying to avoid CPU pegging if something is wrong. 
+ if self.process.poll() is None: + time.sleep(1) + def start(self) -> None: if self.process is not None: return @@ -1528,6 +1574,9 @@ class NspawnMachine(BaseMachine): self.log(f"systemd-nspawn running (pid {self.pid})") + journal_thread = threading.Thread(target=self._stream_journal, daemon=True) + journal_thread.start() + def wait_for_shutdown(self) -> None: if self.process is None: return From 0e1d359e467f40cc90f25d12dbff09df7f1520d6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kier=C3=A1n=20Meinhardt?= Date: Wed, 28 Jan 2026 15:31:50 +0100 Subject: [PATCH 19/37] nixos/test-driver: add machine name to all log output --- nixos/lib/test-driver/src/test_driver/machine/__init__.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/nixos/lib/test-driver/src/test_driver/machine/__init__.py b/nixos/lib/test-driver/src/test_driver/machine/__init__.py index a78ebb721986..373b0f4d5a1d 100644 --- a/nixos/lib/test-driver/src/test_driver/machine/__init__.py +++ b/nixos/lib/test-driver/src/test_driver/machine/__init__.py @@ -581,10 +581,8 @@ class BaseMachine(ABC): def cleanup_statedir(self) -> None: shutil.rmtree(self.state_dir) - self.logger.log(f"deleting machine state directory {self.state_dir}") - self.logger.log( - "if you want to keep the machine state, pass --keep-machine-state" - ) + self.log(f"deleting machine state directory {self.state_dir}") + self.log("if you want to keep the machine state, pass --keep-machine-state") def copy_from_machine(self, source: str, target_dir: str = "") -> None: """Copy a file from the machine (specified by an in-machine source path) to a path @@ -1443,7 +1441,7 @@ class NspawnMachine(BaseMachine): systemd_nspawn_pid = None for line_bytes in self.process.stdout: line = line_bytes.decode() - print(line, end="") + self.log(line.rstrip()) systemd_nspawn_pid_prefix = "systemd-nspawn's PID is " if line.startswith(systemd_nspawn_pid_prefix): From 2e959aba47807ca500d27fb545a51b34bfc1d079 Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?Kier=C3=A1n=20Meinhardt?= Date: Wed, 28 Jan 2026 15:32:21 +0100 Subject: [PATCH 20/37] nixos/test-driver: do not set -x commands in nspawn containers --- nixos/lib/test-driver/src/test_driver/machine/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nixos/lib/test-driver/src/test_driver/machine/__init__.py b/nixos/lib/test-driver/src/test_driver/machine/__init__.py index 373b0f4d5a1d..52ab43b6d79d 100644 --- a/nixos/lib/test-driver/src/test_driver/machine/__init__.py +++ b/nixos/lib/test-driver/src/test_driver/machine/__init__.py @@ -1485,7 +1485,7 @@ class NspawnMachine(BaseMachine): assert nsenter is not None # Pull in /etc/profile, and some shell sanity. - command = f"set -eo pipefail; source /etc/profile; set -xu; {command}" + command = f"set -eo pipefail; source /etc/profile; set -u; {command}" cp = subprocess.run( [ nsenter, From f1d0d7adae71793f805279a65f2937e846f9c822 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kier=C3=A1n=20Meinhardt?= Date: Fri, 6 Feb 2026 15:41:33 +0100 Subject: [PATCH 21/37] nixos/test-driver: document rationale for creating /etc/os-release Co-authored-by: Robert Hensing --- nixos/lib/test-driver/src/test_driver/driver.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/nixos/lib/test-driver/src/test_driver/driver.py b/nixos/lib/test-driver/src/test_driver/driver.py index 8fce372eec9c..6b01828d270c 100644 --- a/nixos/lib/test-driver/src/test_driver/driver.py +++ b/nixos/lib/test-driver/src/test_driver/driver.py @@ -173,7 +173,9 @@ class Driver: ["mount", "-t", "cgroup2", "none", "/sys/fs/cgroup"], check=True ) - # ensure /etc/os-release exists + # systemd-nspawn requires that /etc/os-release exists + # It supports SYSTEMD_NSPAWN_CHECK_OS_RELEASE=0, but that + # would try to "fix" it by bind mounting, which is worse. 
if not os.path.isfile("/etc/os-release"): subprocess.run(["touch", "/etc/os-release"], check=True) From 019697db058b6738eda0cb2cdcc9f5c014c914cb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kier=C3=A1n=20Meinhardt?= Date: Fri, 6 Feb 2026 17:04:19 +0100 Subject: [PATCH 22/37] nixos/test-driver: document implications of sourcing /etc/profile with nspawn machines --- nixos/lib/test-driver/src/test_driver/machine/__init__.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/nixos/lib/test-driver/src/test_driver/machine/__init__.py b/nixos/lib/test-driver/src/test_driver/machine/__init__.py index 52ab43b6d79d..664b2aae3906 100644 --- a/nixos/lib/test-driver/src/test_driver/machine/__init__.py +++ b/nixos/lib/test-driver/src/test_driver/machine/__init__.py @@ -1484,8 +1484,14 @@ class NspawnMachine(BaseMachine): nsenter = shutil.which("nsenter") assert nsenter is not None - # Pull in /etc/profile, and some shell sanity. + # Sourcing /etc/profile on every call of `_execute` ensures a correct shell + # environment (correct PATH, etc.). This is slower than the QEMU version. + # + # NOTE If the test calls switch-to-configuration (with a differently configured specialization) + # this will use the /etc/profile of the new specialisation while `QemuMachine` nodes + # will continue to use the original /etc/profile. command = f"set -eo pipefail; source /etc/profile; set -u; {command}" + cp = subprocess.run( [ nsenter, From af0238c70efc80925c31d293da3bf8affca6437f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kier=C3=A1n=20Meinhardt?= Date: Mon, 9 Feb 2026 16:42:44 +0100 Subject: [PATCH 23/37] nixos/virtualisation: assert nspawn container name length If the names for systemd-nspawn containers are too long, the generated bridge interface names will surpass the kernel limit IFNAMSIZ (15 characters + '\0'). 
--- .../nspawn-container/default.nix | 31 +++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/nixos/modules/virtualisation/nspawn-container/default.nix b/nixos/modules/virtualisation/nspawn-container/default.nix index 6f303eec16df..56bcb13e68bf 100644 --- a/nixos/modules/virtualisation/nspawn-container/default.nix +++ b/nixos/modules/virtualisation/nspawn-container/default.nix @@ -70,6 +70,37 @@ in config = { boot.isNspawnContainer = true; + assertions = [ + { + # Check every interface defined in allInterfaces. + # Containers try to create a bridge "${config.system.name}-${interfaceName}" + assertion = lib.all ( + iface: + let + hostName = "${config.system.name}-${iface.name}"; + in + lib.stringLength hostName <= 15 + ) (lib.attrValues cfg.allInterfaces); + + message = + let + offendingInterfaces = lib.filter ( + iface: lib.stringLength "${config.system.name}-${iface.name}" > 15 + ) (lib.attrValues cfg.allInterfaces); + offenderList = map ( + i: + "${config.system.name}-${i.name} (${toString (lib.stringLength "${config.system.name}-${i.name}")} chars)" + ) offendingInterfaces; + in + '' + The following generated host interface names exceed the Linux 15-character limit: + ${lib.concatStringsSep "\n " offenderList} + + Please shorten 'config.system.name' or the interface names in 'virtualisation.interfaces'. 
+ ''; + } + ]; + # TODO(arianvp): Remove after https://github.com/NixOS/nixpkgs/pull/480686 is merged console.enable = true; From 3acd5cae61da04264077499a24da39141f86dc02 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kier=C3=A1n=20Meinhardt?= Date: Mon, 9 Feb 2026 17:00:31 +0100 Subject: [PATCH 24/37] nixos/virtualisation: disallow specialisation in nspawn-container --- nixos/modules/virtualisation/nspawn-container/default.nix | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/nixos/modules/virtualisation/nspawn-container/default.nix b/nixos/modules/virtualisation/nspawn-container/default.nix index 56bcb13e68bf..c62bcfeb4540 100644 --- a/nixos/modules/virtualisation/nspawn-container/default.nix +++ b/nixos/modules/virtualisation/nspawn-container/default.nix @@ -71,6 +71,14 @@ in boot.isNspawnContainer = true; assertions = [ + { + assertion = config.specialisation == { }; + message = '' + Setting 'specialisation' is disallowed for systemd-nspawn container configurations. + Activating a specialisation requires creating SUID wrappers (e.g., for 'sudo'), + which is prohibited within the Nix build sandbox where the test is run. + ''; + } { # Check every interface defined in allInterfaces. # Containers try to create a bridge "${config.system.name}-${interfaceName}" From 184674895eba0ff6929204696386424cffe1a24a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kier=C3=A1n=20Meinhardt?= Date: Tue, 10 Feb 2026 15:58:50 +0100 Subject: [PATCH 25/37] nixos/test-driver: remove systemd dependency if no containers are configured --- nixos/lib/test-driver/default.nix | 3 +++ nixos/lib/testing/driver.nix | 2 ++ 2 files changed, 5 insertions(+) diff --git a/nixos/lib/test-driver/default.nix b/nixos/lib/test-driver/default.nix index 13158d65de6b..72aa6b11bfe5 100644 --- a/nixos/lib/test-driver/default.nix +++ b/nixos/lib/test-driver/default.nix @@ -24,6 +24,7 @@ util-linux, vde2, + enableNspawn ? false, enableOCR ? false, extraPythonPackages ? 
(_: [ ]), }: @@ -55,6 +56,8 @@ buildPythonApplication { socat util-linux vde2 + ] + ++ lib.optionals enableNspawn [ systemd ] ++ lib.optionals enableOCR [ diff --git a/nixos/lib/testing/driver.nix b/nixos/lib/testing/driver.nix index 864fc9058a95..63ad51147743 100644 --- a/nixos/lib/testing/driver.nix +++ b/nixos/lib/testing/driver.nix @@ -14,6 +14,8 @@ let qemu_pkg = config.qemu.package; imagemagick_light = hostPkgs.imagemagick_light.override { inherit (hostPkgs) libtiff; }; tesseract4 = hostPkgs.tesseract4.override { enableLanguages = [ "eng" ]; }; + + enableNspawn = config.containers != { }; # We want `pkgs.systemd`, *not* `python3Packages.system`. systemd = hostPkgs.systemd; }; From 99884696cc385e8856038abf186ad367a0f76d47 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kier=C3=A1n=20Meinhardt?= Date: Tue, 10 Feb 2026 16:33:17 +0100 Subject: [PATCH 26/37] nixos/test-driver: show all lines of container journal --- .../test-driver/src/test_driver/machine/__init__.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/nixos/lib/test-driver/src/test_driver/machine/__init__.py b/nixos/lib/test-driver/src/test_driver/machine/__init__.py index 664b2aae3906..6a5af7881332 100644 --- a/nixos/lib/test-driver/src/test_driver/machine/__init__.py +++ b/nixos/lib/test-driver/src/test_driver/machine/__init__.py @@ -1534,7 +1534,16 @@ class NspawnMachine(BaseMachine): # Using a loop here handles cases where journalctl might exit unexpectedly while self.process.poll() is None: # While the container is still running with subprocess.Popen( - ["journalctl", "-f", "-D", journal_path, "-o", "short-monotonic"], + [ + "journalctl", + "-f", + "-D", + journal_path, + "--lines", + "all", + "-o", + "short-monotonic", + ], stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True, From 1118384e0eeefbd918387be865fe139be8a65cfd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kier=C3=A1n=20Meinhardt?= Date: Tue, 10 Feb 2026 16:37:13 +0100 Subject: [PATCH 27/37] nixos/test-driver: 
use long options for journalctl call --- .../test-driver/src/test_driver/machine/__init__.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/nixos/lib/test-driver/src/test_driver/machine/__init__.py b/nixos/lib/test-driver/src/test_driver/machine/__init__.py index 6a5af7881332..81809149a12f 100644 --- a/nixos/lib/test-driver/src/test_driver/machine/__init__.py +++ b/nixos/lib/test-driver/src/test_driver/machine/__init__.py @@ -1536,13 +1536,10 @@ class NspawnMachine(BaseMachine): with subprocess.Popen( [ "journalctl", - "-f", - "-D", - journal_path, - "--lines", - "all", - "-o", - "short-monotonic", + "--follow", + f"--directory={journal_path}", + "--lines=all", + "--output=short-monotonic", ], stdout=subprocess.PIPE, stderr=subprocess.STDOUT, From 8c8bf9b0903de9725b6505163fbfd116cb2ad639 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kier=C3=A1n=20Meinhardt?= Date: Tue, 10 Feb 2026 16:41:50 +0100 Subject: [PATCH 28/37] nixos/test-driver: require vm_names and container_names --- .../test-driver/src/test_driver/__init__.py | 18 ++++++++---------- .../lib/test-driver/src/test_driver/driver.py | 10 ++++------ 2 files changed, 12 insertions(+), 16 deletions(-) diff --git a/nixos/lib/test-driver/src/test_driver/__init__.py b/nixos/lib/test-driver/src/test_driver/__init__.py index 733e92d894ec..0f42f2842c77 100755 --- a/nixos/lib/test-driver/src/test_driver/__init__.py +++ b/nixos/lib/test-driver/src/test_driver/__init__.py @@ -178,20 +178,18 @@ def main() -> None: if args.debug_hook_attach is not None: debugger = Debug(logger, args.debug_hook_attach) - if args.vm_names is not None and args.vm_start_scripts is not None: - assert len(args.vm_names) == len(args.vm_start_scripts), ( - f"the number of vm names and vm start scripts must be the same: {args.vm_names} vs.
{args.vm_start_scripts}" - ) - if args.container_names is not None and args.container_start_scripts is not None: - assert len(args.container_names) == len(args.container_start_scripts), ( - f"the number of container names and container start scripts must be the same: {args.container_names} vs. {args.container_start_scripts}" - ) + assert len(args.vm_names) == len(args.vm_start_scripts), ( + f"the number of vm names and vm start scripts must be the same: {args.vm_names} vs. {args.vm_start_scripts}" + ) + assert len(args.container_names) == len(args.container_start_scripts), ( + f"the number of container names and container start scripts must be the same: {args.container_names} vs. {args.container_start_scripts}" + ) with Driver( vm_names=args.vm_names, - vm_start_scripts=args.vm_start_scripts or [], + vm_start_scripts=args.vm_start_scripts, container_names=args.container_names, - container_start_scripts=args.container_start_scripts or [], + container_start_scripts=args.container_start_scripts, vlans=args.vlans, tests=args.testscript.read_text(), out_dir=output_directory, diff --git a/nixos/lib/test-driver/src/test_driver/driver.py b/nixos/lib/test-driver/src/test_driver/driver.py index 6b01828d270c..15103ec6e234 100644 --- a/nixos/lib/test-driver/src/test_driver/driver.py +++ b/nixos/lib/test-driver/src/test_driver/driver.py @@ -79,9 +79,9 @@ class Driver: def __init__( self, - vm_names: list[str] | None, + vm_names: list[str], vm_start_scripts: list[str], - container_names: list[str] | None, + container_names: list[str], container_start_scripts: list[str], vlans: list[int], tests: str, @@ -116,9 +116,7 @@ class Driver: out_dir=self.out_dir, logger=self.logger, ) - for name, vm_start_script in zip( - vm_names or (len(vm_start_scripts) * [None]), vm_start_scripts - ) + for name, vm_start_script in zip(vm_names, vm_start_scripts) ] if len(container_start_scripts) > 0: @@ -135,7 +133,7 @@ class Driver: out_dir=self.out_dir, ) for name, container_start_script in zip( - 
container_names or (len(container_start_scripts) * [None]), + container_names, container_start_scripts, ) ] From c4c96a76e9a94bb6fc3aefa1dd2ab64a36bc6373 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kier=C3=A1n=20Meinhardt?= Date: Tue, 10 Feb 2026 17:09:42 +0100 Subject: [PATCH 29/37] nixos/test-driver: wait for READY message from nspawn container --- .../lib/test-driver/src/test_driver/driver.py | 10 ++- .../src/test_driver/machine/__init__.py | 82 ++++++++++++------- .../nspawn-container/default.nix | 3 + 3 files changed, 64 insertions(+), 31 deletions(-) diff --git a/nixos/lib/test-driver/src/test_driver/driver.py b/nixos/lib/test-driver/src/test_driver/driver.py index 15103ec6e234..8cbcb052be49 100644 --- a/nixos/lib/test-driver/src/test_driver/driver.py +++ b/nixos/lib/test-driver/src/test_driver/driver.py @@ -333,8 +333,16 @@ class Driver: def start_all(self) -> None: """Start all machines""" with self.logger.nested("start all VMs"): + threads = [] for machine in self.machines: - machine.start() + # Create a thread for each machine's start method + t = threading.Thread(target=machine.start, name=f"start-{machine.name}") + threads.append(t) + t.start() + + # Wait for all startup threads to complete before proceeding + for t in threads: + t.join() def join_all(self) -> None: """Wait for all machines to shut down""" diff --git a/nixos/lib/test-driver/src/test_driver/machine/__init__.py b/nixos/lib/test-driver/src/test_driver/machine/__init__.py index 81809149a12f..09729f58dba1 100644 --- a/nixos/lib/test-driver/src/test_driver/machine/__init__.py +++ b/nixos/lib/test-driver/src/test_driver/machine/__init__.py @@ -1384,6 +1384,9 @@ class NspawnMachine(BaseMachine): process: subprocess.Popen | None pid: int | None + machine_sock_path: Path + machine_sock: socket.socket | None + @staticmethod def machine_name_from_start_command(start_command: str) -> str: match = re.search("run-(.+)-nspawn", os.path.basename(start_command)) @@ -1415,6 +1418,8 @@ class 
NspawnMachine(BaseMachine): self.process = None self.pid = None + self.machine_sock_path = self.tmp_dir / f"{self.name}-nspawn.sock" + def ssh_backdoor_command(self, index: int) -> str: # documented in systemd-ssh-generator(8) and https://systemd.io/CONTAINER_INTERFACE/ socket_path = f"/run/systemd/nspawn/unix-export/{self.name}/ssh" @@ -1425,6 +1430,9 @@ class NspawnMachine(BaseMachine): if self.pid is None: return + if self.machine_sock: + self.machine_sock.close() + self.logger.info(f"kill NspawnMachine (pid {self.pid})") assert self.process is not None self.process.terminate() @@ -1433,43 +1441,49 @@ class NspawnMachine(BaseMachine): def is_up(self) -> bool: return self.process is not None + def _poll_socket(self) -> tuple[bool, int | None]: + """Non-blocking check of container status via socket. + Returns (is_ready, leader_pid). + """ + assert self.machine_sock is not None + ready = False + leader_pid = None + try: + data, _ = self.machine_sock.recvfrom(4096) + msg = data.decode() + for line in msg.splitlines(): + if line == "READY=1": + ready = True + if line.startswith("X_NSPAWN_LEADER_PID="): + leader_pid = int(line.split("=")[1]) + except OSError: + pass + return ready, leader_pid + @cached_property def get_systemd_process(self) -> int: - assert self.process is not None, "Machine not started" - assert self.process.stdout is not None, "Machine has no stdout" + """Block until startup is complete and return the PID of the container's systemd process.""" + assert self.process is not None - systemd_nspawn_pid = None - for line_bytes in self.process.stdout: - line = line_bytes.decode() - self.log(line.rstrip()) + container_pid: int | None = None + is_ready = False - systemd_nspawn_pid_prefix = "systemd-nspawn's PID is " - if line.startswith(systemd_nspawn_pid_prefix): - systemd_nspawn_pid = int(line.removeprefix(systemd_nspawn_pid_prefix)) + while not is_ready or container_pid is None: + # Poll the socket until we have the container leader PID + if 
self.process.poll() is not None: + raise MachineError("systemd-nspawn process exited unexpectedly") - if ( - line.startswith("systemd[1]: Startup finished in") - or "Welcome to NixOS" in line - ): - assert systemd_nspawn_pid is not None, "Must find systemd-nspawn PID" - break - else: - raise RuntimeError(f"Failed to start container {self.name}") + # Poll and update our local tracking variables + ready_now, pid_now = self._poll_socket() + if ready_now: + is_ready = True + if pid_now: + container_pid = pid_now - children = ( - Path(f"/proc/{systemd_nspawn_pid}/task/{systemd_nspawn_pid}/children") - .read_text() - .split() - ) - assert len(children) == 1, ( - f"Expected exactly one child process for systemd-nspawn, got {children}" - ) - (child,) = children + if not (is_ready and container_pid): + time.sleep(0.05) - try: - return int(child) - except ValueError as e: - raise RuntimeError(f"Failed to parse child process id {child}") from e + return container_pid def _execute( self, @@ -1570,11 +1584,19 @@ class NspawnMachine(BaseMachine): if self.process is not None: return + if self.machine_sock_path is not None and self.machine_sock_path.exists(): + self.machine_sock_path.unlink() + + self.machine_sock = socket.socket(family=socket.AF_UNIX, type=socket.SOCK_DGRAM) + self.machine_sock.bind(str(self.machine_sock_path)) + self.machine_sock.setblocking(False) + self.process = subprocess.Popen( [self.start_command], env={ "RUN_NSPAWN_ROOT_DIR": str(self.state_dir), "RUN_NSPAWN_SHARED_DIR": str(self.shared_dir), + "NOTIFY_SOCKET": self.machine_sock_path.as_posix(), }, stdin=subprocess.PIPE, stdout=subprocess.PIPE, diff --git a/nixos/modules/virtualisation/nspawn-container/default.nix b/nixos/modules/virtualisation/nspawn-container/default.nix index c62bcfeb4540..f56167ca0b07 100644 --- a/nixos/modules/virtualisation/nspawn-container/default.nix +++ b/nixos/modules/virtualisation/nspawn-container/default.nix @@ -133,6 +133,9 @@ in # > kind of unit allocation or registration 
with systemd-machined. "--keep-unit" "--register=no" + + # Send a READY=1 notification to a socket when the container is fully booted. + "--notify-ready=yes" ]; system.build.nspawn = From 2038234b230a59d86dd4ca4bb8aadf88ab48ad28 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kier=C3=A1n=20Meinhardt?= Date: Tue, 3 Mar 2026 15:23:19 +0100 Subject: [PATCH 30/37] nixos/tests: remove firewall configuration The test merely ensures that machines can reach each other via ICMP ping. This does not require the firewall to be opened or disabled. --- nixos/tests/nixos-test-driver/containers.nix | 5 ----- 1 file changed, 5 deletions(-) diff --git a/nixos/tests/nixos-test-driver/containers.nix b/nixos/tests/nixos-test-driver/containers.nix index f80fca9f5591..7a2363b4ccb9 100644 --- a/nixos/tests/nixos-test-driver/containers.nix +++ b/nixos/tests/nixos-test-driver/containers.nix @@ -5,11 +5,9 @@ nodes = { n1 = { - networking.firewall.enable = false; virtualisation.vlans = [ 1 ]; }; n2 = { - networking.firewall.enable = false; virtualisation.vlans = [ 2 ]; @@ -18,15 +16,12 @@ containers = { c1 = { - networking.firewall.enable = false; virtualisation.vlans = [ 1 ]; }; c2 = { - networking.firewall.enable = false; virtualisation.vlans = [ 2 ]; }; c12 = { - networking.firewall.enable = false; virtualisation.vlans = [ 1 2 From e4df4c2b1dc2fcc631247f94b59ae9964fa4b9b1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kier=C3=A1n=20Meinhardt?= Date: Tue, 3 Mar 2026 15:38:03 +0100 Subject: [PATCH 31/37] nixos/test-driver: disable DHCP in nspawn containers --- nixos/lib/testing/nodes.nix | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/nixos/lib/testing/nodes.nix b/nixos/lib/testing/nodes.nix index ceec0b8fab7f..d1f61b17394c 100644 --- a/nixos/lib/testing/nodes.nix +++ b/nixos/lib/testing/nodes.nix @@ -87,6 +87,24 @@ let # https://github.com/NixOS/nix/blob/959c244a1265f4048390f3ad21679219d7b27a99/src/libstore/unix/build/linux-derivation-builder.cc#L63 services.openssh.settings.UsePAM 
= false; + # Networking for tests is statically configured by default. + # dhcpcd times out after blocking for a long time, which slows down tests. + # See https://github.com/NixOS/nixpkgs/pull/478109#discussion_r2867570799 + networking.useDHCP = lib.mkDefault false; + + # Disable Info manual directory generation to prevent build failures. + # + # Context: 'install-info' (from texinfo) is triggered during system-path + # generation to index manuals, but it requires 'gzip' in the $PATH to + # decompress them. + # When 'networking.useDHCP' is set to false, transitive dependencies + # (like dhcpcd or other network tools) that normally pull 'gzip' into + # the system environment are removed. This leaves 'install-info' + # stranded without 'gzip', causing the 'system-path' derivation to fail. + # Since nspawn containers are typically minimal, disabling 'info' + # is a cleaner fix than explicitly adding 'gzip' to systemPackages. + documentation.info.enable = lib.mkDefault false; + # Gross, insecure hack to make login work. See above. security.pam.services.login = { text = '' From 3dc77ba2b11f30c3d32ced5f7266c08f838b3652 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kier=C3=A1n=20Meinhardt?= Date: Tue, 3 Mar 2026 16:25:41 +0100 Subject: [PATCH 32/37] nixos/test-driver: exponential backoff and logging for nspawn startup Previously, the driver used a constant 50ms sleep while polling the nspawn socket. This replaces it with: 1. An exponential backoff (10ms to 500ms) to reduce CPU churn while remaining responsive. 2. Periodic logging (every 10s) to provide visibility during slow container startups. 
--- .../src/test_driver/machine/__init__.py | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/nixos/lib/test-driver/src/test_driver/machine/__init__.py b/nixos/lib/test-driver/src/test_driver/machine/__init__.py index 09729f58dba1..bd020818c529 100644 --- a/nixos/lib/test-driver/src/test_driver/machine/__init__.py +++ b/nixos/lib/test-driver/src/test_driver/machine/__init__.py @@ -1468,11 +1468,24 @@ class NspawnMachine(BaseMachine): container_pid: int | None = None is_ready = False + start_time = time.monotonic() + last_warning = start_time + delay = 0.01 + max_delay = 0.5 + while not is_ready or container_pid is None: # Poll the socket until we have the container leader PID if self.process.poll() is not None: raise MachineError("systemd-nspawn process exited unexpectedly") + # Print periodic warnings every 10s so the user knows we aren't deadlocked + now = time.monotonic() + if now - last_warning > 10.0: + self.log( + f"still waiting for container '{self.name}' to reach ready state..." + ) + last_warning = now + # Poll and update our local tracking variables ready_now, pid_now = self._poll_socket() if ready_now: @@ -1481,7 +1494,8 @@ class NspawnMachine(BaseMachine): container_pid = pid_now if not (is_ready and container_pid): - time.sleep(0.05) + time.sleep(delay) + delay = min(delay * 2, max_delay) return container_pid From 6d466ecb8ede6cc8bdd99be85b1e4c2932ef0b04 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kier=C3=A1n=20Meinhardt?= Date: Tue, 3 Mar 2026 16:49:54 +0100 Subject: [PATCH 33/37] nixos/test-driver: thread-safe polling of nspawn container journal Previously, on shutdown of the systemd-nspawn containers the reference to the nspawn process could become `None` which would crash the thread that streams the container's journal. We grab and hold on to a reference to the nspawn process in order to detect that process's termination gracefully. 
--- .../test-driver/src/test_driver/machine/__init__.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/nixos/lib/test-driver/src/test_driver/machine/__init__.py b/nixos/lib/test-driver/src/test_driver/machine/__init__.py index bd020818c529..104f8939212f 100644 --- a/nixos/lib/test-driver/src/test_driver/machine/__init__.py +++ b/nixos/lib/test-driver/src/test_driver/machine/__init__.py @@ -1546,6 +1546,10 @@ class NspawnMachine(BaseMachine): assert self.process is not None, "Container not started" journal_path = self.state_dir / "var/log/journal" + # Grab a reference to the process here so we can continue polling + # the container process to see if it has exited. + proc = self.process + # 1. Wait for the directory to actually be created by the container self.log(f"Waiting for journal at {journal_path}...") max_attempts = 10 @@ -1560,7 +1564,7 @@ class NspawnMachine(BaseMachine): # 2. Start the journalctl process # Using a loop here handles cases where journalctl might exit unexpectedly - while self.process.poll() is None: # While the container is still running + while proc.poll() is None: # While the container is still running with subprocess.Popen( [ "journalctl", @@ -1581,7 +1585,7 @@ class NspawnMachine(BaseMachine): for line in iter(log_proc.stdout.readline, ""): if line: self.log_serial(line.rstrip()) - if self.process.poll() is not None: + if proc.poll() is not None: break except Exception as e: self.log(f"Error while reading journalctl output: {e}") @@ -1591,7 +1595,7 @@ class NspawnMachine(BaseMachine): # If we reach here, journalctl stopped while the container is still running. # Wait a moment before retrying to avoid CPU pegging if something is wrong. 
- if self.process.poll() is None: + if proc.poll() is None: time.sleep(1) def start(self) -> None: From d1386a426b5c681c5f9f35992445441fe2553915 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kier=C3=A1n=20Meinhardt?= Date: Wed, 4 Mar 2026 08:39:53 +0100 Subject: [PATCH 34/37] nixos/virtualisation: provide actionable advice on VLAN TAP failure --- .../nspawn-container/run-nspawn/src/run_nspawn/__init__.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/nixos/modules/virtualisation/nspawn-container/run-nspawn/src/run_nspawn/__init__.py b/nixos/modules/virtualisation/nspawn-container/run-nspawn/src/run_nspawn/__init__.py index ed292912d16d..3eb622cbbfe8 100644 --- a/nixos/modules/virtualisation/nspawn-container/run-nspawn/src/run_nspawn/__init__.py +++ b/nixos/modules/virtualisation/nspawn-container/run-nspawn/src/run_nspawn/__init__.py @@ -90,6 +90,10 @@ def ensure_vlan_bridge(vlan: int) -> typing.Generator[str, None, None]: logger.warning( f"TAP {tap_name} not found; container will be isolated from VDE" ) + if not Path("/dev/net").exists(): + logger.warning( + "A common reason for this is that /dev/net is not available in the Nix sandbox. Try adding /dev/net to extra-sandbox-paths." + ) yield bridge_name finally: From 7f6ea2a15f673f8534c25c892eaf281fcbb60eb1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kier=C3=A1n=20Meinhardt?= Date: Wed, 4 Mar 2026 16:48:57 +0100 Subject: [PATCH 35/37] nixos/test-driver: make /etc/hosts resolution VLAN-aware Filter /etc/hosts entries by shared VLANs to ensure nodes resolve to reachable interface addresses. 
--- nixos/lib/testing/network.nix | 53 ++++++++++++-------- nixos/tests/nixos-test-driver/containers.nix | 2 +- 2 files changed, 34 insertions(+), 21 deletions(-) diff --git a/nixos/lib/testing/network.nix b/nixos/lib/testing/network.nix index ff3c57412546..fff7ffbc260f 100644 --- a/nixos/lib/testing/network.nix +++ b/nixos/lib/testing/network.nix @@ -64,29 +64,42 @@ let optionalString (ipInterfaces != [ ]) (head (head ipInterfaces).value.ipv6.addresses).address; - # Put the IP addresses of all VMs in this machine's - # /etc/hosts file. If a machine has multiple - # interfaces, use the IP address corresponding to - # the first interface (i.e. the first network in its - # virtualisation.vlans option). - networking.extraHosts = concatMapAttrsStringSep "" ( - m': config: + # Generate /etc/hosts by only including IP addresses from VLANs that + # both the local machine and the remote machine share. This prevents + # machines from trying to connect via unreachable interfaces (e.g., + # a Management VLAN) and ensures name resolution matches the + # actual network topology of the test. 
+ networking.extraHosts = let - hostnames = - optionalString ( - config.networking.domain != null - ) "${config.networking.hostName}.${config.networking.domain} " - + "${config.networking.hostName}\n"; + localVlans = config.virtualisation.vlans; in - optionalString ( - config.networking.primaryIPAddress != "" - ) "${config.networking.primaryIPAddress} ${hostnames}" - + optionalString ( - config.networking.primaryIPv6Address != "" - ) "${config.networking.primaryIPv6Address} ${hostnames}" - ) testModuleArgs.config.allMachines; + concatMapAttrsStringSep "" ( + mName: remoteConfig: + let + remoteInterfaces = remoteConfig.networking.interfaces; + sharedIps = lib.flatten ( + lib.mapAttrsToList ( + ifaceName: ifaceCfg: + let + remoteIfaceMeta = remoteConfig.virtualisation.allInterfaces."${ifaceName}" or { }; + vlanId = remoteIfaceMeta.vlan or null; + in + if vlanId != null && builtins.elem vlanId localVlans then + builtins.map (addr: addr.address) ifaceCfg.ipv4.addresses + ++ builtins.map (addr: addr.address) ifaceCfg.ipv6.addresses + else + [ ] + ) remoteInterfaces + ); + hostnames = + optionalString ( + remoteConfig.networking.domain != null + ) "${remoteConfig.networking.hostName}.${remoteConfig.networking.domain} " + + "${remoteConfig.networking.hostName}\n"; + in + builtins.concatStringsSep "" (map (ip: "${ip} ${hostnames}") sharedIps) + ) testModuleArgs.config.allMachines; }; - in { key = "network-interfaces"; diff --git a/nixos/tests/nixos-test-driver/containers.nix b/nixos/tests/nixos-test-driver/containers.nix index 7a2363b4ccb9..073e2de28bd6 100644 --- a/nixos/tests/nixos-test-driver/containers.nix +++ b/nixos/tests/nixos-test-driver/containers.nix @@ -49,7 +49,7 @@ c12.succeed("ping -c 1 c1") # Confirm containers in vlan 2 can talk to each other. 
- # <<< c2.succeed("ping -c 1 c12") # <<< TODO: this doesn't work because c12's "primary ip" is for vlan 1 + c2.succeed("ping -c 1 c12") c12.succeed("ping -c 1 c2") # Confirm containers in separate vlans cannot talk to each other. From 8615a25cbece3dcfab393637f7c25b2f3239afb9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kier=C3=A1n=20Meinhardt?= Date: Tue, 10 Mar 2026 16:10:26 +0100 Subject: [PATCH 36/37] nixos/test-driver: make /etc/hosts VLAN-aware with primary IP fallback Resolves an issue where nodes on shared secondary VLANs could not reach each other if their primary IPs were on isolated networks. --- nixos/lib/testing/network.nix | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/nixos/lib/testing/network.nix b/nixos/lib/testing/network.nix index fff7ffbc260f..570b854d8e28 100644 --- a/nixos/lib/testing/network.nix +++ b/nixos/lib/testing/network.nix @@ -64,11 +64,9 @@ let optionalString (ipInterfaces != [ ]) (head (head ipInterfaces).value.ipv6.addresses).address; - # Generate /etc/hosts by only including IP addresses from VLANs that - # both the local machine and the remote machine share. This prevents - # machines from trying to connect via unreachable interfaces (e.g., - # a Management VLAN) and ensures name resolution matches the - # actual network topology of the test. + # Generate /etc/hosts including every remote's primary IP addresses + # (whichever VLAN they may belong to) as well as all IP addresses from + # VLANs that both the local machine and the remote machine share. networking.extraHosts = let localVlans = config.virtualisation.vlans; @@ -91,13 +89,24 @@ let [ ] ) remoteInterfaces ); + + # We also want to test router protocols that enable connections + # between nodes even if they don't share a VLAN, so we include + # the primary IPs of all machines in the hosts file. 
+ primaryIPs = [ + remoteConfig.networking.primaryIPAddress + remoteConfig.networking.primaryIPv6Address + ]; + + allReachableIps = lib.lists.uniqueStrings (sharedIps ++ primaryIPs); + hostnames = optionalString ( remoteConfig.networking.domain != null ) "${remoteConfig.networking.hostName}.${remoteConfig.networking.domain} " + "${remoteConfig.networking.hostName}\n"; in - builtins.concatStringsSep "" (map (ip: "${ip} ${hostnames}") sharedIps) + builtins.concatStringsSep "" (map (ip: "${ip} ${hostnames}") allReachableIps) ) testModuleArgs.config.allMachines; }; in From 01d5f7a0d9663bce31d21f1c3944ba161a70d303 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kier=C3=A1n=20Meinhardt?= Date: Mon, 16 Mar 2026 13:35:19 +0100 Subject: [PATCH 37/37] nixos/test-driver: pass containers to tests Co-authored-by: cinereal --- nixos/lib/testing/nodes.nix | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/nixos/lib/testing/nodes.nix b/nixos/lib/testing/nodes.nix index d1f61b17394c..795838e77983 100644 --- a/nixos/lib/testing/nodes.nix +++ b/nixos/lib/testing/nodes.nix @@ -45,7 +45,10 @@ let ./nixos-test-base.nix { key = "nodes"; - _module.args.nodes = config.nodesCompat; + _module.args = { + inherit (config) containers; + nodes = config.nodesCompat; + }; } ( { options, ... }: