#!/usr/bin/perl
use strict;

########## 程序使用说明 ##########
# 本程序可以由无密码sudo的用户执行，或root用户直接执行。
# 本程序直接运行后，即可维持SGE的qmaster和execd服务。能自动识别qmaster主机和execd主机。
# 推荐使用crontab -e来例行运行本程序，实现永久生效。

# Nothing to maintain on a machine without an SGE installation under
# /opt/sge, so leave immediately in that case.
if ( !-e "/opt/sge" ) {
    exit;
}

# Remove any status log left behind by a previous run before doing anything
# else. $cmdString is declared here at file scope and is reused by the later
# sections of the script to hold the command currently being executed.
my $cmdString = "sudo /bin/rm -rf /tmp/keep_SGE.log";
print "CMD: ", $cmdString, "\n";
system $cmdString;

# Determine the SGE master host: the first line of act_qmaster is taken as
# the master's host name. $qmaster_host stays undef when the file is missing
# or empty.
my $qmaster_host;
my $act_qmaster_file = "/opt/sge/default/common/act_qmaster";
if ( -e $act_qmaster_file ) {
    open my $act_qmaster_fh, '<', $act_qmaster_file
        or die "Can not open file $act_qmaster_file, $!";
    $qmaster_host = <$act_qmaster_fh>;
    close $act_qmaster_fh;
    if ( defined $qmaster_host ) {
        # Strip the trailing newline and any stray surrounding whitespace so
        # the later string comparison with the local host name is exact.
        $qmaster_host =~ s/^\s+|\s+$//g;
    }
}

# Determine the SGE execution hosts: every entry in the spool directory other
# than "qmaster" is treated as the name of an execution host.
my %execd_host;
my $spool_dir = "/opt/sge/default/spool";
if ( opendir my $spool_dh, $spool_dir ) {
    foreach my $entry ( readdir $spool_dh ) {
        next if $entry =~ /^\./;        # ".", ".." and hidden entries
        next if $entry eq "qmaster";
        $execd_host{$entry} = 1;
    }
    closedir $spool_dh;
}
else {
    # An unreadable spool directory simply means no execution host is known.
    warn "Can not read directory $spool_dir, $!\n";
}

# Determine the host name of the current machine.
my $hostname = `/usr/bin/hostname`;
$hostname = "" unless defined $hostname;
chomp($hostname);
if ( $hostname eq "" ) {
    # /usr/bin/hostname is missing or printed nothing: fall back to the core
    # Sys::Hostname module. hostname() croaks on failure, hence the eval.
    $hostname = eval { require Sys::Hostname; Sys::Hostname::hostname() };
    $hostname = "" unless defined $hostname;
}

# When this machine is the master host, make sure sge_qmaster is running.
# The host names must be known and non-empty: comparing two empty/undefined
# names would otherwise wrongly treat this machine as the master.
if (   defined $qmaster_host
    && defined $hostname
    && $hostname ne ""
    && $hostname eq $qmaster_host )
{
    my $log_file = "/tmp/keep_SGE.log";

    # Query the sgemaster status in the background, capturing stdout and
    # stderr in the log file. The portable "> file 2>&1" form is used because
    # system() runs the command through /bin/sh, where the bash-only "&>"
    # may be parsed as "run in background, then truncate the file".
    $cmdString = "sudo /opt/sge/default/common/sgemaster status > $log_file 2>&1 &";
    print "CMD: $cmdString\n";
    system($cmdString);

    # Give the status query 3 seconds; anything still running after that is
    # considered hung and is killed.
    sleep 3;
    $cmdString = "/usr/bin/ps -aux | grep -P \"sgemaster status\" | grep -v grep";
    print "CMD: $cmdString\n";
    my $grep = `$cmdString`;
    if ( defined $grep && $grep ne "" ) {
        foreach my $ps_line ( split /\n/, $grep ) {
            # The PID is taken from the second whitespace-separated column of
            # the ps output; skip any line where that column is not numeric.
            my ($stuck_pid) = $ps_line =~ /^\s*\S+\s+(\d+)\b/;
            next unless defined $stuck_pid;
            $cmdString = "sudo kill -s 9 $stuck_pid";
            print "CMD: $cmdString\n";
            system($cmdString);
        }
    }

    # Restart the service when the status output reports that qmaster is not
    # running.
    open my $log_fh, '<', $log_file
        or die "Can not open file $log_file, $!";
    my $status = join "", <$log_fh>;
    close $log_fh;
    print $status;
    if ( $status =~ m/qmaster.*is not running/ ) {
        $cmdString = "sudo /opt/sge/default/common/sgemaster restart";
        system($cmdString) == 0 or die "Failed to execute: $cmdString\n";
    }
}

# When this machine is an execution host, make sure sge_execd is running.
if ( defined $hostname && exists $execd_host{$hostname} ) {
    # If a pid file exists, check whether that process is still sge_execd;
    # without a usable pid file the execution daemon is restarted.
    my $if_reboot_sgeexecd = 0;
    my $pid_file = "/opt/sge/default/spool/$hostname/execd.pid";
    if ( -e $pid_file ) {
        open my $pid_fh, '<', $pid_file
            or die "Can not open file $pid_file, $!";
        my $pid = <$pid_fh>;
        close $pid_fh;
        $pid = "" unless defined $pid;
        $pid =~ s/^\s+|\s+$//g;

        if ( $pid =~ /^\d+$/ ) {
            # Ask checkprog whether the recorded pid still belongs to
            # sge_execd.
            $cmdString = "/opt/sge/utilbin/lx-amd64/checkprog $pid sge_execd";
            print "CMD: $cmdString\n";
            my $status = `$cmdString`;
            $status = "" unless defined $status;
            print $status;
            $if_reboot_sgeexecd = 1 if $status =~ m/is not running/;
            # Example output:
            # /opt/sge/utilbin/lx-amd64/checkprog 1588693 sge_execd
            # pid "1588693" is not running or has another program name than "sge_execd"
            # /opt/sge/utilbin/lx-amd64/checkprog 1688470 sge_execd
            # pid "1688470" with process name "sge_execd" is running
        }
        else {
            # An empty or non-numeric pid file cannot identify a running
            # daemon, and must never be interpolated into a shell command;
            # treat it like a missing pid file.
            $if_reboot_sgeexecd = 1;
        }
    }
    else {
        $if_reboot_sgeexecd = 1;
    }

    # Restart the execution daemon when required.
    if ( $if_reboot_sgeexecd == 1 ) {
        $cmdString = "sudo /opt/sge/default/common/sgeexecd restart";
        system($cmdString) == 0 or die "Failed to execute: $cmdString\n";
    }
}

# Remove the temporary status log /tmp/keep_SGE.log.
# $cmdString is already declared at file scope near the top of the script, so
# it is assigned here rather than re-declared with "my" (a second "my" in the
# same scope would mask the earlier declaration).
$cmdString = "sudo /bin/rm -rf /tmp/keep_SGE.log";
print "CMD: $cmdString\n";
system($cmdString);

# Finally, show the output of qstat -f as a summary of the cluster state.
print "\n程序运行结束，给出 qstat -f 的结果如下：\n";
$cmdString = "export SGE_ROOT=/opt/sge && /opt/sge/bin/lx-amd64/qstat -f";
system($cmdString) == 0 or die "Failed to execute: $cmdString\n";
print "\n";
