#!/bin/sh  
# \
    exec oagtclsh "$0" "$@"

# Put the OAG library directories at the front of the package search path.
# The lib_patch directory ends up first, followed by apps/lib, followed by
# whatever auto_path already contained.
set auto_path [linsert $auto_path 0 \
                   /usr/local/oag/lib_patch/$env(HOST_ARCH) \
                   /usr/local/oag/apps/lib/$env(HOST_ARCH)]
APSStandardSetup

# CDELDEBUG: when present in the environment (any value), debug is 1.
set debug [info exists env(CDELDEBUG)]
if {$debug} {
    puts "debug mode"
}

# CDELNOOP: when present in the environment (any value), noop is 1.
set noop [info exists env(CDELNOOP)]
if {$noop} {
    puts "noop mode"
}

# rshcommand --
#
#   Runs a command through /usr/bin/rsh and returns what it printed.
#
# Arguments (parsed from args by APSStrictParseArguments):
#   -host <name>       host handed to rsh
#   -command <string>  command line to run
#
# Results:
#   The output captured by exec, or an empty string when exec reports an
#   error (Tcl's exec raises an error for a non-zero exit status or for
#   output on stderr) or when there was no output.
#
# Notes:
#   The words of host and command are concatenated and re-parsed by eval
#   before exec sees them.  Any "|" word in the command string is therefore
#   interpreted by exec itself as a pipeline separator rather than being
#   passed to rsh as part of the remote command.
proc rshcommand {args} {
    set command ""
    set host ""
    APSStrictParseArguments {host command}
    if {[catch {eval exec /usr/bin/rsh $host $command} output]} {
        # Best effort: callers treat an empty result as "nothing found".
        return ""
    }
    return $output
}
# rshcommandnoop --
#
#   Dry-run counterpart of rshcommand: takes the same -host and -command
#   options (parsed by APSStrictParseArguments) and only prints the rsh
#   command line to stdout without executing anything.
proc rshcommandnoop {args} {
    set command ""
    set host ""
    APSStrictParseArguments {host command}
    puts [join [list /usr/bin/rsh $host $command] " "]
}

# DeleteSGEJobs --
#
#   For every job ID given, works out from the qstat listing whether the
#   job is queued, serial or a multi-slot MPI job.  For MPI jobs it walks
#   the process tree on the master host and on each slave host (via
#   rshcommand) and collects the PIDs that belong to the job.  Afterwards
#   it removes the collected jobs with qdel, pauses, and sends "kill -9"
#   to the collected PIDs on each host.
#
# Arguments:
#   jobIDList   list of GridEngine job IDs to delete.
#
# Globals read:
#   qstatresults  output of "qstat -u *" already split into lines.
#   debug         when true, every remote kill command is echoed.
#   noop          when true, qdel and the remote kills are skipped.  The
#                 ps and qstat queries used for discovery still run.
#
# Side effects:
#   Prints progress to stdout, appends to /lustre/oagsoftware/logs/cdel.log,
#   runs qdel, sleeps for 5 seconds plus 1 second per 20 slots of the
#   largest job seen, and kills remote processes.
#
# Local bookkeeping:
#   pid(0)      PID column of the first sge_shepherd line found for the job.
#   pid(n+1)    PIDs whose parent PID is listed in pid(n).
#   rshd(0)     PIDs of the MPI launcher daemon lines found with ps.
#   rshd(n+1)   descendants of rshd(n), first level filtered by port.
#   kill(host)  space separated PIDs to kill on that host.
proc DeleteSGEJobs {jobIDList} {
    global qstatresults debug noop

    set jobIDdeleteList ""
    set maxslots 1
    foreach jobID $jobIDList {

        # Find the job in the qstat listing.  A job may occupy several
        # lines; the host of the last matching line is kept.
        set host ""
        set oneslot 0
        set queued 0
        foreach line $qstatresults {
            if {[lindex $line 0] != $jobID} {
                continue
            }
            # The next to last column is taken as queue@host.domain and
            # reduced to the short host name.
            set host [lindex [split [lindex [split [lindex $line end-1] @] 1] .] 0]
            set slots [lindex $line end]
            if {$slots > $maxslots} {
                set maxslots $slots
            }
            set state [lindex $line 4]
            if {($state == "qw") || ($state == "Eqw")} {
                puts "Found queued job $jobID in GridEngine queue"
                lappend jobIDdeleteList $jobID
                set queued 1
                break
            }
            if {$slots == "1"} {
                set oneslot 1
            }
        }
        if {$queued} {
            continue
        }
        if {![llength $host]} {
            puts "Warning: job $jobID is not known by GridEngine"
            continue
        }
        puts "Locating MPI tasks related to job $jobID"
        set results [rshcommand -host $host -command "ps -fe | fgrep sge_shepherd-$jobID | fgrep -v fgrep | fgrep -v cdel"]
        if {$oneslot == "1"} {
            # Serial jobs need no process hunting, whether or not the
            # shepherd was found; qdel alone is used for them.
            puts "Found serial job $jobID in GridEngine queue"
            lappend jobIDdeleteList $jobID
            continue
        }
        if {![llength $results]} {
            puts "Warning: unable to delete job $jobID"
            continue
        }

        # Second word of the ps -fe output is the PID on its first line.
        set pid(0) [lindex $results 1]
        set pid(1) ""
        set pid(2) ""
        set pid(3) ""
        set pid(4) ""
        set pid(5) ""
        set pid(6) ""
        set pidList ""
        set ppidList ""
        set commandList ""

        # Snapshot of every process on the master host.  The first line
        # is the ps header and is dropped.
        set results [rshcommand -host $host -command "ps -e -o pid,ppid,command"]
        set results [split $results \n]
        foreach line [lrange $results 1 end] {
            # Skip lines that cannot be parsed as a Tcl list.
            if {[catch {lindex $line 0}]} {
                continue
            }
            lappend pidList [lindex $line 0]
            lappend ppidList [lindex $line 1]
            lappend commandList [lrange $line 2 end]
        }

        # Build six generations of descendants below the shepherd.
        set m 1
        for {set n 0} {$n < 6} {incr n} {
            foreach p $pidList pp $ppidList {
                if {[lsearch -exact $pid($n) $pp] != -1} {
                    lappend pid($m) $p
                }
            }
            incr m
        }


        # Identify the MPI flavour from the command of the process whose
        # PID equals pid(2).
        set numList ""
        set LAM 0
        set openmpi 0
        set mpich2smpd 0
        set mvapich2 0
        set mvapich 0
        set mpich3 0
        unset -nocomplain num
        if {($pid(3) == "")} {
            #check to see if this is a LAM/MPI job
            foreach p $pidList pp $ppidList c $commandList {
                if {($p == $pid(2))} {
                    if {([lindex $c 0] == "/share/lam-7.1.2/bin/mpirun")} {
                        set LAM 1
                    } elseif {([lindex $c 0] == "/disk1/mpich2_smpd_nemesis/bin/mpiexec")} {
                        set mpich2smpd 1
                    }
                }
            }
        }


        foreach p $pidList pp $ppidList c $commandList {
            if {($p == $pid(2))} {
                if {([lindex $c 0] == "/usr/local/software/openmpi-1.2.4/bin/mpirun")} {
                    set openmpi 1
                    puts "Found openmpi job $jobID in GridEngine queue"
                    lappend jobIDdeleteList $jobID
                    #if {!$noop} {
                    #exec /act/sge/bin/lx24-amd64/qdel $jobID
                    #}
                    break
                } elseif {([lindex $c 0] == "/act/mvapich2/gnu/bin/mpirun_rsh")} {
                    set mvapich2 1
                } elseif {([lindex $c 1] == "/act/mvapich-1.2rc1-gcc-g95/bin/mpirun")} {
                    set mvapich 1
                } elseif {([lindex $c 0] == "/lustre/3rdPartySoftware/mvapich2-1.9b/bin/mpirun_rsh")} {
                    set mvapich2 1
                } elseif {([lindex $c 0] == "/lustre/3rdPartySoftware/mvapich2-1.9rc1/bin/mpirun_rsh")} {
                    set mvapich2 1
                } elseif {([lindex $c 0] == "/lustre/3rdPartySoftware/mvapich2-2.2/bin/mpirun_rsh")} {
                    set mvapich2 1
                } elseif {([lindex $c 0] == "/lustre/3rdPartySoftware/mvapich2-2.3b/bin/mpirun_rsh")} {
                    set mvapich2 1
                } elseif {([lindex $c 0] == "/lustre/3rdPartySoftware/mvapich2-2.3.2/bin/mpirun_rsh")} {
                    set mvapich2 1
                } elseif {([lindex $c 0] == "/lustre/3rdPartySoftware/mpich-3.3.2/bin/mpirun")} {
                    set mpich3 1
                } else {
                    puts "UNKNOWN [lindex $c 0]"
                }
            }
        }
        if {$openmpi} {
            # openmpi jobs are handled by qdel alone.
            continue
        }

        # Collect candidate port or session numbers in numList, one entry
        # per occurrence, and pick the launcher daemon name to look for.
        # The most frequent candidate is chosen below as tcpPort.
        if {$LAM} {
            puts "LAM/MPI job detected"
            set deamon lamd
            foreach p $pidList pp $ppidList c $commandList {
                if {([lindex $c end] == "sge-${jobID}-undefined")} {
                    set i [lsearch -exact $c "-P"]
                    if {$i != -1} {
                        incr i
                        lappend numList [lindex $c $i]
                        set num([lindex $c $i]) 0
                    }
                }
            }
        } elseif {$mpich2smpd} {
            puts "mpich2_smpd_nemesis job detected"
            set deamon /share/mpich2_smpd_nemesis/bin/smpd
            foreach p $pidList pp $ppidList c $commandList {
                if {($p == $pid(2))} {
                    if {([lindex $c 0] == "/disk1/mpich2_smpd_nemesis/bin/mpiexec")} {
                        lappend numList [lindex $c 4]
                        set num([lindex $c 4]) 0
                    }
                }
            }
        } elseif {$mvapich2} {
            puts "mvapich2 job detected"
            set deamon in.rshd
            foreach p $pidList pp $ppidList c $commandList {
                if {($pp == $pid(2))} {
                    foreach ele $c {
                        if {[string range $ele 0 20] == "MPISPAWN_MPIRUN_PORT="} {
                            set e [string range $ele 21 end]
                            lappend numList $e
                            set num($e) 0
                        }
                    }
                }
            }
        } elseif {$mvapich} {
            puts "mvapich job detected"
            set deamon in.rshd
            foreach p $pidList pp $ppidList c $commandList {
                if {($pp != $pid(5)) && ($pp != $pid(3))} {
                    continue
                }
                foreach ele $c {
                    if {[string is integer $ele]} {
                        lappend numList $ele
                        set num($ele) 0
                    }
                }
            }
        } elseif {$mpich3} {
            puts "mpich3 job detected"
            #set deamon qrsh_starter
            set deamon hydra_pmi_proxy
            foreach p $pidList pp $ppidList c $commandList {
                if {($pp != $pid(2))} {
                    continue
                }
                set i [lsearch -exact $c "--control-port"]
                if {$i == -1} {
                    # Without this guard the index would wrap to word 0
                    # and an empty port would receive a vote.
                    continue
                }
                incr i
                # The option value is split on ":" and its second field
                # is used as the port.
                set ele [lindex [split [lindex $c $i] ":"] 1]
                lappend numList $ele
                set num($ele) 0
            }
        } else {
            set deamon in.rshd
            foreach p $pidList pp $ppidList c $commandList {
                if {($pp != $pid(5)) && ($pp != $pid(3))} {
                    continue
                }
                foreach ele $c {
                    if {[string is integer $ele]} {
                        lappend numList $ele
                        set num($ele) 0
                    } elseif {[string range $ele 0 11] == "MPIRUN_PORT="} {
                        set e [string range $ele 12 end]
                        lappend numList $e
                        set num($e) 0
                    }
                }
            }
        }

        # Majority vote: the candidate seen most often becomes tcpPort.
        foreach n $numList {
            incr num($n)
        }
        set max 0
        set tcpPort ""
        foreach n [array names num] {
            if {$num($n) > $max} {
                set tcpPort $n
                set max $num($n)
            }
        }

        # Locate the launcher daemon processes on the master host.
        if {($mpich2smpd)} {
            set results [rshcommand -host $host -command "ps -fe | fgrep $deamon | fgrep $tcpPort | fgrep -v fgrep | fgrep -v cdel"]
        } elseif {($mpich3)} {
            set results [rshcommand -host $host -command "ps -fe | fgrep $deamon | fgrep $tcpPort | fgrep -v fgrep | fgrep -v cdel | fgrep -v qrsh"]
            #set results [rshcommand -host $host -command "ps -fe | fgrep $deamon | fgrep -v fgrep | fgrep -v cdel"]
        } else {
            set results [rshcommand -host $host -command "ps -fe | fgrep $deamon | fgrep -v fgrep | fgrep -v cdel"]
        }
        if {![llength $results]} {
            # NOTE(review): the job is skipped here without being added
            # to the qdel list and without a warning - confirm intended.
            continue
        }
        set results [split $results \n]
        # Start from a clean slate for every job.  Otherwise PIDs left
        # over from the previous job's last slave host would be mixed in,
        # and the LAM case with no matching line would leave rshd(0)
        # undefined.
        set rshd(0) ""
        foreach line $results {
            if {$LAM} {
                if {[lsearch -exact $line $tcpPort] != -1} {
                    lappend rshd(0) [lindex $line 1]
                }
            } else {
                lappend rshd(0) [lindex $line 1]
            }
        }
        if {($mpich3)} {
            # pidtree is run on the remote host; its output lines are
            # joined into one space separated PID string.
            set results [rshcommand -host $host -command "pidtree $rshd(0)"]
            set kill($host) [join [split $results \n]]
        } else {
            set rshd(1) ""
            set rshd(2) ""
            set rshd(3) ""
            set rshd(4) ""
            set m 1
            # First generation below the daemons: only children whose
            # command line mentions the chosen port (any child for LAM).
            for {set n 0} {$n < 1} {incr n} {
                foreach p $pidList pp $ppidList c $commandList {
                    if {[lsearch -exact $rshd($n) $pp] != -1} {
                        if {[lsearch -exact $c $tcpPort] != -1} {
                            lappend rshd($m) $p
                        } elseif {[lsearch -exact $c "MPIRUN_PORT=$tcpPort"] != -1} {
                            lappend rshd($m) $p
                        } elseif {[lsearch -exact $c "MPISPAWN_MPIRUN_PORT=$tcpPort"] != -1} {
                            lappend rshd($m) $p
                        } elseif {$LAM} {
                            lappend rshd($m) $p
                        }
                    }
                }
                incr m
            }
            # Further generations: every descendant.
            for {set n 1} {$n < 4} {incr n} {
                foreach p $pidList pp $ppidList c $commandList {
                    if {[lsearch -exact $rshd($n) $pp] != -1} {
                        lappend rshd($m) $p
                    }
                }
                incr m
            }
            # The daemon PIDs themselves, rshd(0), are only killed for
            # mpich2 smpd jobs.
            if {$mpich2smpd} {
                append kill($host) "$pid(6) $pid(5) $pid(4) $pid(3) $pid(2) $pid(1) $rshd(4) $rshd(3) $rshd(2) $rshd(1) $rshd(0) "
            } else {
                append kill($host) "$pid(6) $pid(5) $pid(4) $pid(3) $pid(2) $pid(1) $rshd(4) $rshd(3) $rshd(2) $rshd(1) "
            }
        }

        # Find the hosts the job runs on from the full qstat listing: a
        # line whose first word looks like queue@host starts a section,
        # and job lines below it belong to that host.
        set results [split [exec qstat -f -u *] \n]

        set slaveList ""
        set queueInstance ""
        foreach line $results {
            if {[lindex $line 0] == $jobID} {
                # Ignore a job line seen before any queue@host line.
                if {[string length $queueInstance]} {
                    lappend slaveList [lindex [split $queueInstance @] 1]
                }
            }
            if {[llength [split [lindex $line 0] @]] == 2} {
                set queueInstance [lindex $line 0]
            }
        }

        set slaveList [lsort -unique $slaveList]
        foreach slave $slaveList {
            # NOTE(review): host was shortened at the first dot, slave is
            # not - confirm both use the same form of the host name.
            if {$slave == $host} {
                continue
            }
            set pidList ""
            set ppidList ""
            set commandList ""
            set results [rshcommand -host $slave -command "ps -e -o pid,ppid,command"]
            set results [split $results \n]
            foreach line [lrange $results 1 end] {
                # Skip lines that cannot be parsed as a Tcl list.
                if {[catch {lindex $line 0}]} {
                    continue
                }
                lappend pidList [lindex $line 0]
                lappend ppidList [lindex $line 1]
                lappend commandList [lrange $line 2 end]
            }

            if {$mpich2smpd} {
                set results [rshcommand -host $slave -command "ps -fe | fgrep $deamon | fgrep $tcpPort | fgrep -v fgrep | fgrep -v cdel"]
            } elseif {($mpich3)} {
                set results [rshcommand -host $slave -command "ps -fe | fgrep $deamon | fgrep $tcpPort | fgrep -v fgrep | fgrep -v cdel | fgrep -v qrsh"]
            } else {
                set results [rshcommand -host $slave -command "ps -fe | fgrep $deamon | fgrep -v fgrep | fgrep -v cdel"]
            }
            if {![llength $results]} {
                continue
            }
            set results [split $results \n]
            set rshd(0) ""
            set rshd(1) ""
            set rshd(2) ""
            set rshd(3) ""
            set rshd(4) ""
            foreach line $results {
                if {$LAM} {
                    if {[lsearch -exact $line $tcpPort] != -1} {
                        lappend rshd(0) [lindex $line 1]
                    }
                } else {
                    lappend rshd(0) [lindex $line 1]
                }
            }
            if {($mpich3)} {
                set results [rshcommand -host $slave -command "pidtree $rshd(0)"]
                set kill($slave) [join [split $results \n]]
            } else {
                set m 1
                for {set n 0} {$n < 1} {incr n} {
                    foreach p $pidList pp $ppidList c $commandList {
                        if {[lsearch -exact $rshd($n) $pp] != -1} {
                            if {[lsearch -exact $c $tcpPort] != -1} {
                                lappend rshd($m) $p
                            } elseif {[lsearch -exact $c "MPIRUN_PORT=$tcpPort"] != -1} {
                                lappend rshd($m) $p
                            } elseif {[lsearch -exact $c "MPISPAWN_MPIRUN_PORT=$tcpPort"] != -1} {
                                lappend rshd($m) $p
                            } elseif {$LAM} {
                                lappend rshd($m) $p
                            }
                        }
                    }
                    incr m
                }

                for {set n 1} {$n < 4} {incr n} {
                    foreach p $pidList pp $ppidList c $commandList {
                        if {[lsearch -exact $rshd($n) $pp] != -1} {
                            lappend rshd($m) $p
                        }
                    }
                    incr m
                }
                if {$mpich2smpd} {
                    append kill($slave) "$rshd(4) $rshd(3) $rshd(2) $rshd(1) $rshd(0) "
                } else {
                    append kill($slave) "$rshd(4) $rshd(3) $rshd(2) $rshd(1) "
                }
            }
        }


        lappend jobIDdeleteList $jobID
    }

    puts "Deleting job(s) $jobIDdeleteList from GridEngine"
    if {!$noop} {
        set fid [open /lustre/oagsoftware/logs/cdel.log a]
        puts $fid "[clock format [clock seconds]]"
        puts $fid "/act/sge/bin/lx24-amd64/qdel $jobIDdeleteList"
        close $fid
        # qdel failures are deliberately ignored; the kills below still run.
        catch {eval exec /act/sge/bin/lx24-amd64/qdel $jobIDdeleteList}
    }

    # Pause before killing leftovers: 5 seconds plus 1 second per 20 slots.
    puts "Pause [expr {round((($maxslots / 20.0) + 5))}] seconds"
    after [expr {round((($maxslots / 20.0) + 5) * 1000)}]

    puts "Deleting leftover MPI tasks with the kill -9 command"
    foreach node [lsort [array names kill]] {
        set k [string trim $kill($node)]
        # Nothing to do for hosts where no PID was collected.
        if {[string length $k] > 0} {
            if {$debug} {
                puts "rshcommand -host $node -command \"kill -9 $k\""
            }
            if {!$noop} {
                # Called directly so the PID text gathered from remote ps
                # output is not evaluated as Tcl a second time.
                rshcommand -host $node -command "kill -9 $k"
            }
        }
    }
}


# Command line handling.
#
#   cdel -u user          delete every job qstat lists for that user
#   cdel id ?id ...?      delete the listed job IDs
#
# Usage errors and qstat failures exit with status 1.  Having no jobs for
# the requested user is not an error and exits with status 0.
set usage "usage: cdel \[-u user\] \[<job-ID> <job-ID> <...>\]"

if {[lindex $argv 0] == "-u"} {
    # -u needs a user name; do not hand an empty string to qstat.
    if {[llength $argv] < 2} {
        puts $usage
        exit 1
    }
    if {[catch {exec qstat -u [lindex $argv 1]} qstatresults]} {
        puts "error: $qstatresults"
        exit 1
    }
    set qstatresults [split $qstatresults \n]
    set jobIDList ""
    # The first two lines of the qstat output are skipped as header; the
    # job ID is the first column of every following line.
    foreach line [lrange $qstatresults 2 end] {
        lappend jobIDList [lindex $line 0]
    }
    if {![llength $jobIDList]} {
        puts "No jobs for [lindex $argv 1]"
        exit
    }
} else {
    set jobIDList $argv
    if {![llength $jobIDList]} {
        puts $usage
        exit 1
    }
}

# DeleteSGEJobs reads the listing of all users' jobs from the global
# qstatresults, so it is (re)loaded here regardless of the mode above.
if {[catch {exec qstat -u *} qstatresults]} {
    puts "error: $qstatresults"
    exit 1
}
set qstatresults [split $qstatresults \n]
DeleteSGEJobs $jobIDList
