nixosTests.slurm: test cluster and job state

This commit is contained in:
Edward Tjörnhammar
2026-03-14 17:20:37 +01:00
parent 1680dd9de5
commit 80d9b9cc39

View File

@@ -148,6 +148,8 @@ in
};
testScript = ''
# Start the test machines up front (NixOS test-driver `start_all`), before
# any subtest waits on a service.
start_all()
# Subtest: on the `dbd` machine, wait until slurmdbd.service is up and
# TCP port 6819 accepts connections (presumably slurmdbd's listening port).
with subtest("can_start_slurmdbd"):
dbd.wait_for_unit("slurmdbd.service")
dbd.wait_for_open_port(6819)
@@ -155,36 +157,82 @@ in
# Subtest: the controller machine is ready. On `control` this waits for
# multi-user.target, slurmctld.service, a cluster entry in sacctmgr and
# an open port 6817.
with subtest("cluster_is_initialized"):
control.wait_for_unit("multi-user.target")
control.wait_for_unit("slurmctld.service")
# Retries until the first column of `sacctmgr list cluster` contains the
# text "default". The grep is unanchored, so any value that merely
# contains "default" also passes.
control.wait_until_succeeds("sacctmgr list cluster | awk '{ print $1 }' | grep default")
control.wait_for_open_port(6817)
# NOTE(review): this is the second start_all() in the script (the first
# directly follows `testScript = ''`). The diff markers are missing from
# this view, so this is presumably the removed pre-change call -- confirm
# against the commit.
start_all()
# Subtest: each of node1, node2 and node3 has its slurmd unit running and
# port 6818 open. The statements after the loop then wait for the submit
# host, the accounting cluster entry and an all-idle node list.
# NOTE(review): indentation is lost in this view, so it is not visible
# which of the statements below belong to the loop or to the subtest.
with subtest("can_start_slurmd"):
for node in [node1, node2, node3]:
# NOTE(review): the next two lines wait on the same unit under two
# spellings ("slurmd" / "slurmd.service"); they look like the before and
# after versions of one line -- confirm.
node.wait_for_unit("slurmd")
node.wait_for_unit("slurmd.service")
node.wait_for_open_port(6818)
# Test that the cluster works and can distribute jobs;
# NOTE(review): the identical wait appears twice in a row; likely the
# removed and re-added copy of the same line -- confirm.
submit.wait_for_unit("multi-user.target")
submit.wait_for_unit("multi-user.target")
# Retry until sacctmgr prints a line that is exactly "default"
# (grep -x matches whole lines only, -q suppresses output).
control.wait_until_succeeds(
"sacctmgr -nP list cluster format=cluster | grep -qx default"
)
# Test that the cluster works and can distribute jobs;
# Retry until sinfo lists all three nodes with the exact line
# "<node> idle". The greps are chained with &&, so all three must match
# within the same attempt.
control.wait_until_succeeds(
"sinfo -Nh -o '%N %T' | grep -Fx 'node1 idle' && "
"sinfo -Nh -o '%N %T' | grep -Fx 'node2 idle' && "
"sinfo -Nh -o '%N %T' | grep -Fx 'node3 idle'"
)
# Subtest: a 3-node srun started from the submit host reaches all nodes.
with subtest("run_distributed_command"):
# Run `hostname` on 3 nodes of the partition (so on all the 3 nodes).
# The output must contain the 3 different names
# This variant only counts distinct output lines and requires exactly 3.
submit.succeed("srun -N 3 hostname | sort | uniq | wc -l | xargs test 3 -eq")
# This variant names the job "distributed-hostname-check" (-J) and compares
# the sorted, de-duplicated hostnames, joined by spaces, against the exact
# string 'node1 node2 node3 '. The trailing space is expected because tr
# also replaces the final newline.
# NOTE(review): '\n' sits in a non-raw Python string, so tr presumably
# receives a literal newline character instead of backslash-n; tr treats
# both the same, but confirm this is intended.
submit.succeed(
"test \"$(srun -J distributed-hostname-check -N 3 hostname | sort -u | tr '\n' ' ')\" = 'node1 node2 node3 '"
)
# Subtest: retries on `control` until plain `sacct` output contains the
# text "hostname" anywhere; job state is not checked.
# NOTE(review): a subtest with a near-identical purpose follows
# (check_slurm_dbd_job_for_srun); with diff markers stripped this one is
# presumably the pre-change version -- confirm.
with subtest("check_slurm_dbd_job"):
# find the srun job from above in the database
control.wait_until_succeeds("sacct | grep hostname")
# Subtest: the named srun job is recorded in accounting as completed.
with subtest("check_slurm_dbd_job_for_srun"):
# find the srun job from above in the database
# sacct is filtered by --name=distributed-hostname-check, the job name the
# srun call in run_distributed_command sets with -J. The grep requires a
# whole line of the form "distributed-hostname-check|COMPLETED", optionally
# followed by "+" and further text. `\\|` and `\\+` reach grep -E as the
# escaped literals `\|` and `\+`.
submit.wait_until_succeeds(
"sacct -X -P -n --name=distributed-hostname-check -o JobName,State | "
"grep -Eq '^distributed-hostname-check\\|COMPLETED(\\+.*)?$'"
)
# Subtest: run `mpitest` across 3 nodes with srun's --mpi=pmix option and
# check what it prints.
with subtest("run_PMIx_mpitest"):
# Passes if any output line contains the text "size=3".
submit.succeed("srun -N 3 --mpi=pmix mpitest | grep size=3")
# Runs the job once, keeps its output in $out and echoes it, then checks:
#   * some line is exactly "size=3" (grep -Fx), and
#   * exactly 3 lines contain "hello world from process".
# NOTE(review): the steps are joined with `;`, so a failing size=3 grep only
# fails the subtest if the test driver runs commands with `set -e` (the
# NixOS driver is expected to prepend `set -euo pipefail`) -- confirm.
submit.succeed(
"out=$(srun -N 3 --mpi=pmix mpitest); "
"echo \"$out\"; "
"echo \"$out\" | grep -Fx 'size=3'; "
"test \"$(echo \"$out\" | grep -c 'hello world from process')\" -eq 3"
)
# Subtest: submit a batch job, wait for it, and verify its output file and
# final state. ${sbatchScript} and ${sbatchOutput} are Nix interpolations
# defined outside this view.
with subtest("run_sbatch"):
# Submit and block until the job ends; then require the output file to
# contain "sbatch success" somewhere.
submit.succeed("sbatch --wait ${sbatchScript}")
submit.succeed("grep 'sbatch success' ${sbatchOutput}")
# Same submission with --parsable: sbatch's stdout is captured as the job id
# and stored in /tmp/sbatch.jobid on the submit host for the sacct check.
submit.succeed(
"jobid=$(sbatch --parsable --wait ${sbatchScript}); "
"echo \"$jobid\" > /tmp/sbatch.jobid"
)
# Require a line that is exactly "sbatch success" (grep -Fx).
submit.succeed("grep -Fx 'sbatch success' ${sbatchOutput}")
# Retry until sacct reports a state containing COMPLETED for the stored id.
# NOTE(review): the pattern is unanchored, so the second alternative
# (COMPLETED\+) is already covered by the first; plain 'COMPLETED' would
# match the same lines.
submit.wait_until_succeeds(
"sacct -X -j $(cat /tmp/sbatch.jobid) -n -o State | grep -Eq 'COMPLETED|COMPLETED\\+'"
)
# The queue must be empty: `squeue -h` has to print nothing.
submit.succeed("test -z \"$(squeue -h)\"")
# Subtest: after the jobs above, retry until sinfo again shows node1, node2
# and node3 each with the exact line "<node> idle". The three greps are
# chained with &&, so all must match within one attempt.
with subtest("cluster_returns_to_idle"):
control.wait_until_succeeds(
"sinfo -Nh -o '%N %T' | grep -Fx 'node1 idle' && "
"sinfo -Nh -o '%N %T' | grep -Fx 'node2 idle' && "
"sinfo -Nh -o '%N %T' | grep -Fx 'node3 idle'"
)
# Subtest: slurmrestd on the `rest` machine answers an authenticated
# request to /slurm/v0.0.43/diag on port 6820.
with subtest("rest"):
rest.wait_for_unit("slurmrestd.service")
# Token = second '='-separated field of `scontrol token` output, with
# trailing whitespace removed. A token that itself contains '=' would be cut
# short by this split.
token = control.succeed("scontrol token").split('=')[1].rstrip()
# NOTE(review): curl runs with -sk and without -f here, so an HTTP error
# status presumably would not make the command fail -- confirm.
rest.succeed("${pkgs.curl}/bin/curl -sk -H X-SLURM-USER-TOKEN:%s -X GET 'http://localhost:6820/slurm/v0.0.43/diag'" % token)
rest.wait_for_open_port(6820)
# split('=', 1) keeps everything after the first '=', and strip() removes
# surrounding whitespace.
token = control.succeed("scontrol token").split('=', 1)[1].strip()
# The adjacent string literals are joined before `% token` is applied, so
# the single %s in the header receives the token. The response must
# contain the text "meta".
rest.succeed(
"${pkgs.curl}/bin/curl -fsS "
"-H X-SLURM-USER-TOKEN:%s "
"http://localhost:6820/slurm/v0.0.43/diag | grep -q 'meta'" % token
)
# Subtest: the same diag request with a made-up token must not succeed.
# rest.fail passes when the command exits non-zero.
# NOTE(review): with -f curl is expected to exit non-zero on an HTTP error
# status, but a connection failure would also satisfy `fail`; this subtest
# relies on the service having been shown reachable beforehand -- confirm.
with subtest("rest_rejects_invalid_token"):
rest.fail(
"${pkgs.curl}/bin/curl -fsS "
"-H X-SLURM-USER-TOKEN:not-a-real-token "
"http://localhost:6820/slurm/v0.0.43/diag"
)
'';
}