Skip to content

Commit

Permalink
Merge pull request #398 from tmaeno/master
Browse files Browse the repository at this point in the history
timeout for build or non-JEDI jobs stalled in defined state
  • Loading branch information
tmaeno authored Aug 22, 2024
2 parents cf6d7d3 + 88443ca commit 623b389
Showing 1 changed file with 23 additions and 1 deletion.
24 changes: 23 additions & 1 deletion pandaserver/daemons/scripts/copyArchive.py
Original file line number Diff line number Diff line change
Expand Up @@ -519,7 +519,7 @@ def _memoryCheck(str):
statsPerShare.setdefault(gshare, {"nq": 0, "nr": 0})
statsPerPQ.setdefault(computingSite, {})
statsPerPQ[computingSite].setdefault(gshare, {"nq": 0, "nr": 0})
if jobStatus in ["definied", "assigned", "activated", "starting"]:
if jobStatus in ["defined", "assigned", "activated", "starting"]:
statsPerPQ[computingSite][gshare]["nq"] += nJobs
statsPerShare[gshare]["nq"] += nJobs
elif jobStatus == "running":
Expand Down Expand Up @@ -729,6 +729,28 @@ def _memoryCheck(str):
Client.killJobs(jediJobs[iJob : iJob + nJob], 51, keepUnmerged=True)
iJob += nJob

# reassign stalled defined build and non-JEDI jobs
#
# Purpose: select PandaIDs from ATLAS_PANDA.jobsDefined4 whose jobStatus is
# "defined" and whose creationTime is older than `timeoutValue` minutes, limited to
#   * prodSourceLabel "panda" with a transformation matching '%build%', or
#   * prodSourceLabel "user" with lockedBy different from "jedi",
# and hand the collected IDs to Client.killJobs.
#
# NOTE(review): leading indentation is not visible in this rendering, so the
# nesting of the statements below (e.g. whether the debug log sits inside the
# `if len(jobs):` branch) must be confirmed against the real file.
#
# Cut-off time as a naive datetime: "now" is taken in UTC and tzinfo is then
# stripped. `timeoutValue` is defined outside this block and is used as minutes.
timeLimit = datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None) - datetime.timedelta(minutes=timeoutValue)
# Bind variables referenced by the :name placeholders in the query below.
varMap = {}
varMap[":jobStatus"] = "defined"
varMap[":prodSourceLabel_p"] = "panda"
varMap[":prodSourceLabel_u"] = "user"
varMap[":timeLimit"] = timeLimit
varMap[":lockedBy"] = "jedi"
# NOTE(review): in SQL a `<>` comparison is not true when the column is NULL, so
# user jobs whose lockedBy is NULL would not be selected by the second branch of
# the OR — confirm whether such rows can exist and whether skipping them is intended.
# `status` is not read again within this block.
status, res = taskBuffer.querySQLS(
"SELECT PandaID FROM ATLAS_PANDA.jobsDefined4 WHERE ((prodSourceLabel=:prodSourceLabel_p AND transformation LIKE '%build%') OR "
"(prodSourceLabel=:prodSourceLabel_u AND lockedBy<>:lockedBy)) AND jobStatus=:jobStatus AND creationTime<:timeLimit ORDER BY PandaID",
varMap,
)
# Collect the PandaIDs; `res` may be None, in which case `jobs` stays empty.
jobs = []
if res is not None:
# Each row is unpacked as a 1-tuple holding the PandaID.
# NOTE(review): the loop variable `id` shadows the Python builtin of the same name.
for (id,) in res:
jobs.append(id)
# kill
# NOTE(review): the meaning of kill code 2 is defined by Client.killJobs; the log
# message below describes the action as "reassign" — confirm the code matches.
if len(jobs):
Client.killJobs(jobs, 2)
_logger.debug(f"reassign stalled defined build and non-JEDI jobs with timeout {timeoutValue}min ({str(jobs)})")

# reassign long-waiting jobs in defined table
timeLimit = datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None) - datetime.timedelta(hours=12)
status, res = taskBuffer.lockJobsForReassign("ATLAS_PANDA.jobsDefined4", timeLimit, [], ["managed"], [], [], [], True)
Expand Down

0 comments on commit 623b389

Please sign in to comment.