monitor.sh 691 Bytes
Newer Older
xuchen committed
1 2
# Wait until at least $gpu_num GPUs look idle, then run $cmd once and exit.
#
# GPU state is polled by parsing the text output of `gpustat`. The parsing
# assumes gpustat's default layout: line 1 is a host header and every later
# line describes one GPU, with the third '|'-separated field holding
# "used / total" memory. A GPU is treated as idle when its used-memory figure
# is below 100 (MB in gpustat's default output -- confirm if gpustat is
# configured differently).
gpu_num=4          # number of idle GPUs required before launching
cmd="sh train.sh"  # command line handed to eval once enough GPUs are idle

# A single snapshot file is reused by every poll and removed when the script
# exits, instead of leaking a fresh temp file on each iteration.
record=$(mktemp -t temp.record.XXXXXX) || exit 1
trap 'rm -f -- "$record"' EXIT

while :
do
    # Start every poll from a clean slate so results of an earlier poll
    # cannot leak into this one.
    device=()   # indices of the idle GPUs found in this poll
    count=0     # number of idle GPUs found in this poll

    if gpustat > "$record"; then
        dev=0   # zero-based GPU index = position after the header line
        {
            IFS= read -r _   # discard the header line
            while IFS='|' read -r _ _ mem _ || [[ -n $mem ]]
            do
                # Keep only the "used" part of "used / total" and strip the
                # padding around it.
                use=${mem%%/*}
                use=${use//[[:space:]]/}

                # Ignore lines that do not carry a numeric usage value; an
                # empty value would otherwise compare as 0 and be counted as
                # an idle GPU. 10# forces base 10 for zero-padded numbers.
                if [[ $use =~ ^[0-9]+$ ]] && (( 10#$use < 100 )); then
                    device[$count]=$dev
                    count=$((count + 1))
                    if [[ $count -eq $gpu_num ]]; then
                        break
                    fi
                fi
                dev=$((dev + 1))
            done
        } < "$record"
    else
        # Without a valid snapshot no GPU can be considered idle.
        echo "gpustat failed; retrying" >&2
    fi

    if [[ $count -lt $gpu_num ]]; then
        sleep 60s
    else
        echo "Run $cmd"
        # Quoted so that $cmd reaches eval as one unmodified string.
        eval "$cmd"
        sleep 10s
        exit
    fi
done