From 80d9b9cc39ab85e505aa7ebc8f0c9a85633d1b53 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Edward=20Tj=C3=B6rnhammar?=
Date: Sat, 14 Mar 2026 17:20:37 +0100
Subject: [PATCH] nixosTests.slurm: test cluster and job state

---
Notes (placed below the "---" marker, so not part of the commit message):

What the patch does to testScript:
  * calls start_all() before the first subtest instead of after
    "cluster_is_initialized", and folds the old "can_start_slurmd" subtest
    into "cluster_is_initialized";
  * additionally waits for ports 6817 (control), 6818 (node1..node3) and
    6820 (rest) after the corresponding unit is up;
  * replaces loose greps with checks on sinfo / sacct / squeue output, names
    the srun job so it can be looked up in accounting, and adds the
    "cluster_returns_to_idle" and "rest_rejects_invalid_token" subtests.

Review adjustments folded into this copy (all line-count neutral, so the
hunk headers and the diffstat below are unchanged; the post-image blob id
on the "index" line still refers to the patch as originally generated):
  * the empty-queue check after sbatch polls with wait_until_succeeds
    instead of a one-shot succeed, matching the idle-node polling that
    follows it;
  * the sbatch accounting check uses grep -Fq 'COMPLETED'; the second
    branch of 'COMPLETED|COMPLETED\+' could never decide the match;
  * the REST diag check pipes curl into a plain grep with output discarded
    instead of grep -q, so curl is never cut off mid-write while the
    command runs under pipefail.

 nixos/tests/slurm.nix | 80 ++++++++++++++++++++++++++++++++++---------
 1 file changed, 64 insertions(+), 16 deletions(-)

diff --git a/nixos/tests/slurm.nix b/nixos/tests/slurm.nix
index b468e32e45e0..06b622739852 100644
--- a/nixos/tests/slurm.nix
+++ b/nixos/tests/slurm.nix
@@ -148,6 +148,8 @@ in
   };
 
   testScript = ''
+    start_all()
+
     with subtest("can_start_slurmdbd"):
         dbd.wait_for_unit("slurmdbd.service")
         dbd.wait_for_open_port(6819)
@@ -155,36 +157,82 @@ in
     with subtest("cluster_is_initialized"):
         control.wait_for_unit("multi-user.target")
         control.wait_for_unit("slurmctld.service")
-        control.wait_until_succeeds("sacctmgr list cluster | awk '{ print $1 }' | grep default")
+        control.wait_for_open_port(6817)
 
-    start_all()
-
-    with subtest("can_start_slurmd"):
         for node in [node1, node2, node3]:
-            node.wait_for_unit("slurmd")
+            node.wait_for_unit("slurmd.service")
+            node.wait_for_open_port(6818)
 
-    # Test that the cluster works and can distribute jobs;
-    submit.wait_for_unit("multi-user.target")
+        submit.wait_for_unit("multi-user.target")
+
+        control.wait_until_succeeds(
+            "sacctmgr -nP list cluster format=cluster | grep -qx default"
+        )
+
+        # Test that the cluster works and can distribute jobs;
+        control.wait_until_succeeds(
+            "sinfo -Nh -o '%N %T' | grep -Fx 'node1 idle' && "
+            "sinfo -Nh -o '%N %T' | grep -Fx 'node2 idle' && "
+            "sinfo -Nh -o '%N %T' | grep -Fx 'node3 idle'"
+        )
 
     with subtest("run_distributed_command"):
         # Run `hostname` on 3 nodes of the partition (so on all the 3 nodes).
         # The output must contain the 3 different names
-        submit.succeed("srun -N 3 hostname | sort | uniq | wc -l | xargs test 3 -eq")
+        submit.succeed(
+            "test \"$(srun -J distributed-hostname-check -N 3 hostname | sort -u | tr '\n' ' ')\" = 'node1 node2 node3 '"
+        )
 
-    with subtest("check_slurm_dbd_job"):
-        # find the srun job from above in the database
-        control.wait_until_succeeds("sacct | grep hostname")
+    with subtest("check_slurm_dbd_job_for_srun"):
+        # find the srun job from above in the database
+        submit.wait_until_succeeds(
+            "sacct -X -P -n --name=distributed-hostname-check -o JobName,State | "
+            "grep -Eq '^distributed-hostname-check\\|COMPLETED(\\+.*)?$'"
+        )
 
     with subtest("run_PMIx_mpitest"):
-        submit.succeed("srun -N 3 --mpi=pmix mpitest | grep size=3")
+        submit.succeed(
+            "out=$(srun -N 3 --mpi=pmix mpitest); "
+            "echo \"$out\"; "
+            "echo \"$out\" | grep -Fx 'size=3'; "
+            "test \"$(echo \"$out\" | grep -c 'hello world from process')\" -eq 3"
+        )
 
     with subtest("run_sbatch"):
-        submit.succeed("sbatch --wait ${sbatchScript}")
-        submit.succeed("grep 'sbatch success' ${sbatchOutput}")
+        submit.succeed(
+            "jobid=$(sbatch --parsable --wait ${sbatchScript}); "
+            "echo \"$jobid\" > /tmp/sbatch.jobid"
+        )
+        submit.succeed("grep -Fx 'sbatch success' ${sbatchOutput}")
+        submit.wait_until_succeeds(
+            "sacct -X -j $(cat /tmp/sbatch.jobid) -n -o State | grep -Fq 'COMPLETED'"
+        )
+        submit.wait_until_succeeds("test -z \"$(squeue -h)\"")  # poll: a finished job may still be listed briefly
+
+    with subtest("cluster_returns_to_idle"):
+        control.wait_until_succeeds(
+            "sinfo -Nh -o '%N %T' | grep -Fx 'node1 idle' && "
+            "sinfo -Nh -o '%N %T' | grep -Fx 'node2 idle' && "
+            "sinfo -Nh -o '%N %T' | grep -Fx 'node3 idle'"
+        )
 
     with subtest("rest"):
         rest.wait_for_unit("slurmrestd.service")
-        token = control.succeed("scontrol token").split('=')[1].rstrip()
-        rest.succeed("${pkgs.curl}/bin/curl -sk -H X-SLURM-USER-TOKEN:%s -X GET 'http://localhost:6820/slurm/v0.0.43/diag'" % token)
+        rest.wait_for_open_port(6820)
+
+        token = control.succeed("scontrol token").split('=', 1)[1].strip()
+
+        rest.succeed(
+            "${pkgs.curl}/bin/curl -fsS "
+            "-H X-SLURM-USER-TOKEN:%s "
+            "http://localhost:6820/slurm/v0.0.43/diag | grep 'meta' > /dev/null" % token  # no -q: let curl finish writing
+        )
+
+    with subtest("rest_rejects_invalid_token"):
+        rest.fail(
+            "${pkgs.curl}/bin/curl -fsS "
+            "-H X-SLURM-USER-TOKEN:not-a-real-token "
+            "http://localhost:6820/slurm/v0.0.43/diag"
+        )
   '';
 }