#!/usr/local/bin/perl5
# bmonitor v2.12 for LSF
# Christian Rossi (rossi@loria.fr)
# Centre Charles Hermite/LORIA (http://cch.loria.fr/LSF/bmonitor) - Nancy - France
# License for bmonitor : GNU General Public License
#                        (http://www.gnu.org/copyleft/gpl.html)
# v2.0  2000/03/20
# v2.01 2000/04/25
# v2.02 2000/04/27
# v2.1  2000/06/05
# v2.11 2000/07/07 new f_nb_proc
# v2.12 2000/09/05 modification of f_nb_proc

use strict;
use warnings;
use POSIX qw(strftime);

my $VERSION = 2.12;

# Approximate length of one refresh cycle, in seconds.  Every non-pending
# job is assumed to cost about one second ("bjobs -l" + "bhist"), so the
# sleep phase is shortened by the number of such jobs.
my $REFRESH_DELAY = 50;

# State shared between the main loop and the f_* helpers.  The helpers keep
# their historical f_xxx($jobid) call shape but read the command output of
# the job currently being processed from these variables.
my @bjobsl_out;             # lines of "bjobs -l <jobid>"
my @bhist_out;              # lines of "bhist <jobid>"
my $nb_proc         = 0;    # read by f_efficasity()
my $cpu_time_in_sec = 0;    # read by f_efficasity()
my $run_time_in_sec = 0;    # read by f_efficasity()

# print() and syswrite() are both used on STDOUT, and external commands write
# to the same terminal: unbuffered output keeps everything in order.
$| = 1;

system("clear");

my $first_pass = 1;     # first display: rows are printed as soon as computed
my $date       = "";    # time stamp of the last completed refresh
my $countdown  = 0;     # seconds shown in the "Update in" status line

while (1) {
    # counter for the non-pending jobs
    my $job_count = 0;

    # default : bjobs -u all
    # The shell form is kept on purpose: "No job found" is only seen thanks
    # to the 2>&1 redirection.
    my $bjobs_cmd = @ARGV ? "bjobs @ARGV 2>&1" : "bjobs -u all 2>&1";
    open(my $bjobs, '-|', $bjobs_cmd)
        or die "bmonitor: cannot run bjobs: $!\n";

    # exit if there is no job
    my $first_ligne = <$bjobs>;
    $first_ligne = "" unless defined $first_ligne;
    if ($first_ligne =~ /No.*job found/) {
        print $first_ligne;
        close($bjobs);
        exit;
    }

    # other possible column order:
    # JOBID USER STAT QUEUE PROC MEM SWAP CPUTIME RUNTIME RUNLIM EFF HOG EXECHOST SUBMITTIME
    my $p_first_ligne  = "CPU MEM SWAP CPUTIM JOB USER STAT QUEUE RUNTIM RUNLIM EFF HOG EXECHOST SUBMIT_TIME\n";
    my $p_second_ligne = "-------------------------------------------------------------------------------------------------------\n";

    # First pass: print immediately.  Later passes: collect everything in a
    # buffer so that the screen is cleared and redrawn in one go.
    my $p_lignes = "";
    if ($first_pass) {
        print $p_first_ligne;
        print $p_second_ligne;
    }
    else {
        $p_lignes = $p_first_ligne . $p_second_ligne;
    }

    my @lignes_jobs = <$bjobs>;
    close($bjobs);

    foreach my $ligne (@lignes_jobs) {
        # only work with lines that begin with a job number
        next unless $ligne =~ /^[0-9]+\s/;

        my ($jobid, $user, $stat, $queue, undef, $exec_host) =
            map { defined $_ ? $_ : "" } (split(/ +/, $ligne))[0 .. 5];
        my $pending = ($stat eq 'PEND');

        # Each non-pending job costs about one second: keep the countdown
        # of the status line moving while the jobs are being queried.
        if (!$first_pass && !$pending) {
            $countdown = 0 if $countdown < 0;
            _print_countdown($date, $countdown);
            $countdown--;
        }

        @bjobsl_out = _read_command('bjobs', '-l', $jobid);

        my $submit_time = _submit_time($ligne);

        # number of proc
        $nb_proc = f_nb_proc($jobid);

        my ($mem, $swap) = (0, 0);
        my ($p_cpu_time, $p_run_time, $p_exec_host, $p_efficasity, $p_hog_factor);

        if (!$pending) {
            # bhist is only needed when the job is not pending
            @bhist_out = _read_command('bhist', $jobid);
            $job_count++;

            # memory and swap
            ($mem, $swap) = f_mem_swap($jobid);

            # cpu time
            my ($cpu_time_hour, $cpu_time_min);
            ($cpu_time_in_sec, $cpu_time_hour, $cpu_time_min) = f_cpu_time($jobid);

            # run time
            my ($run_time_hour, $run_time_min);
            ($run_time_in_sec, $run_time_hour, $run_time_min) = f_run_time($jobid);

            # eff (100 * cpu_time / (nb_proc * run_time))
            my $efficasity = f_efficasity($jobid);

            # hog factor (100 * run_time / total_time)
            my $hog_factor = f_hog_factor($jobid);

            $p_cpu_time   = sprintf("%3s:%02d", $cpu_time_hour, $cpu_time_min);
            $p_run_time   = sprintf("%3s:%02d", $run_time_hour, $run_time_min);
            $p_exec_host  = sprintf("%-8s", $exec_host);
            $p_efficasity = sprintf("%5.1f%%", $efficasity);
            $p_hog_factor = sprintf("%5.1f%%", $hog_factor);
        }
        else {
            # a pending job has no run-time figures: blank columns
            $p_cpu_time   = ' ' x 6;
            $p_run_time   = ' ' x 6;
            $p_exec_host  = ' ' x 8;
            $p_efficasity = ' ' x 6;
            $p_hog_factor = ' ' x 6;
        }

        # run limit (available for pending jobs too)
        my (undef, $run_limit_hour, $run_limit_min) = f_run_limit($jobid);

        my $row = join(" ",
            sprintf("%2s",    $nb_proc),
            sprintf("%5.0f",  $mem),
            sprintf("%5.0f",  $swap),
            $p_cpu_time,
            sprintf("%5s",    $jobid),
            sprintf("%-8s",   $user),
            sprintf("%-5s",   $stat),
            sprintf("%-10s",  $queue),
            $p_run_time,
            sprintf("%3s:%02d", $run_limit_hour, $run_limit_min),
            $p_efficasity,
            $p_hog_factor,
            $p_exec_host,
            $submit_time,
        ) . "\n";

        if ($first_pass) {
            print $row;
        }
        else {
            $p_lignes .= $row;
        }
    }

    if (!$first_pass) {
        # display all the jobs at once
        system("clear");
        print $p_lignes;
    }
    $first_pass = 0;

    # print bhosts and lsload
    print "\n";
    system("bhosts");
    print "\n";
    system("lsload");
    print "\n";

    # date: 2000/03/21 10:44:46
    $date = strftime('%Y/%m/%d %H:%M:%S', localtime);

    # The user waits for about $REFRESH_DELAY seconds:
    # wait time = sleep time + time to run bjobs -l and bhist.
    $countdown = $REFRESH_DELAY;
    my $sleep_time = $job_count <= $REFRESH_DELAY
                   ? $REFRESH_DELAY - $job_count
                   : 0;

    # print the date and the delay before the next update
    for my $tick (0 .. $sleep_time) {
        _print_countdown($date, $countdown);
        $countdown--;
        sleep(1);
    }
}

########################################################################################
##                               functions for bmonitor                               ##
########################################################################################

# Run an external command without a shell and return its output lines.
# Returns an empty list when the command cannot be started.
sub _read_command {
    my @command = @_;
    open(my $pipe, '-|', @command) or return;
    my @lines = <$pipe>;
    close($pipe);
    return @lines;
}

# Rewrite the one-line status "bmonitor <version> - <date> - Update in <n>s".
# The leading \r moves back to the start of the line so that the status is
# overwritten in place; at most 55 bytes are written.
sub _print_countdown {
    my ($stamp, $seconds) = @_;
    my $status = sprintf("\rbmonitor $VERSION - %s - Update in %ss ", $stamp, $seconds);
    syswrite(STDOUT, $status, 55);
    return;
}

# Numeric value of the leading number of a string; 0 when there is none
# (what Perl's implicit conversion yields, without the warning).
sub _to_number {
    my ($text) = @_;
    return 0 unless defined $text;
    return $text =~ /^\s*([-+]?(?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?)/ ? $1 + 0 : 0;
}

# Split a duration in seconds into (hours, minutes); seconds are dropped.
sub _hours_minutes {
    my ($seconds) = @_;
    return (0, 0) if $seconds < 60;
    my $total_min = int($seconds / 60);
    return (int($total_min / 60), $total_min % 60);
}

# Convert an amount with an LSF unit (Kbytes, Mbytes, Gbytes) to megabytes.
# An unknown or missing unit leaves the amount unchanged.
sub _megabytes {
    my ($amount, $unit) = @_;
    my $value = _to_number($amount);
    return $value unless defined $unit;
    return $value / 1024 if $unit =~ /^Kbytes/;
    return $value * 1024 if $unit =~ /^Gbytes/;
    return $value;
}

# Submit time of a bjobs line.  The trailing "Mon DD HH:MM" is preferred;
# otherwise fall back to everything from column 67 on.
sub _submit_time {
    my ($ligne) = @_;
    return $1 if $ligne =~ /(\w{3}\s+\d{1,2}\s+\d{1,2}:\d{2})\s*$/;
    (my $tail = $ligne) =~ s/^.{67}//;
    chomp($tail);
    return $tail;
}

# Fields of the bhist summary line of the current job, after removing the
# first 25 characters (job id, user and job name, which may contain spaces).
# The remainder is split on spaces: index 3 is the RUN time and index 7 the
# TOTAL time, as used by f_run_time() and f_hog_factor().
# Returns an empty list when @bhist_out has no summary line.
sub _bhist_summary_fields {
    foreach my $line (@bhist_out) {
        next unless $line =~ /^[0-9]+\s/;
        (my $copy = $line) =~ s/^.{25}//;   # copy: @bhist_out is not modified
        return split(/ +/, $copy);
    }
    return;
}

######################################
# f_efficasity
# efficiency of the current job, in percent:
#   100 * cpu_time / (nb_proc * run_time)
# capped at 999.9; 999.9 is also returned when nb_proc * run_time is
# not positive.
# uses $nb_proc, $cpu_time_in_sec and $run_time_in_sec
######################################
sub f_efficasity {
    my $busy = _to_number($nb_proc) * _to_number($run_time_in_sec);
    return "999.9" unless $busy > 0;

    my $effi = 100 * _to_number($cpu_time_in_sec) / $busy;
    $effi = 999.9 if $effi > 999.9;
    return "$effi";
}

######################################
# f_mem_swap
# memory and swap of the current job, in megabytes
# returns ($mem, $swap); 0 for a value that is not reported
# use bjobs -l $jobid
######################################
sub f_mem_swap {
    my ($mem, $swap) = (0, 0);
    foreach my $line (@bjobsl_out) {
        next unless $line =~ /MEM: /;
        my ($mem_amount,  $mem_unit)  = $line =~ /MEM:\s+(\S+)\s+(\S+)/;
        my ($swap_amount, $swap_unit) = $line =~ /SWAP:\s+(\S+)\s+(\S+)/;
        $mem  = _megabytes($mem_amount,  $mem_unit);
        $swap = _megabytes($swap_amount, $swap_unit);
        last;
    }
    return ($mem, $swap);
}

#####################################################
# f_cpu_time
# cpu time of the current job
# returns ($cpu_time, $hour, $min):
#   $cpu_time: total cpu time in seconds ("" if not reported)
#   $hour, $min: hours and minutes of the cpu time
# use bjobs -l $jobid
#####################################################
sub f_cpu_time {
    my $cpu_time = "";
    foreach my $line (@bjobsl_out) {
        next unless $line =~ /seconds/;
        my @word = split(/ +/, $line);
        $cpu_time = $word[6] if defined $word[6];
        last;
    }
    my ($hour, $min) = _hours_minutes(_to_number($cpu_time));
    return ($cpu_time, $hour, $min);
}

#######################################################
# f_run_time
# run time of the current job
# returns ($run_time, $hour, $min):
#   $run_time: total run time in seconds ("" if not reported)
#   $hour, $min: hours and minutes of the run time
# use bhist $jobid
#######################################################
sub f_run_time {
    my @field    = _bhist_summary_fields();
    my $run_time = defined $field[3] ? $field[3] : "";
    my ($hour, $min) = _hours_minutes(_to_number($run_time));
    return ($run_time, $hour, $min);
}

###########################################################
# f_run_limit
# run limit of the current job
# returns ($run_limit, $hour, $min):
#   $run_limit: the limit as printed by bjobs, in minutes
#               ("" if no limit line is found)
#   $hour, $min: hours and minutes of the limit
# users must ask for a run limit (else it's a cpu limit)
# use bjobs -l $jobid
###########################################################
sub f_run_limit {
    my $run_limit = "";
    foreach my $line (@bjobsl_out) {
        next unless $line =~ /[0-9] min of/;
        my @word = split(/ +/, $line);
        $run_limit = $word[1] if defined $word[1];
        # if cpu limit and run limit are both present, the run limit is
        # the second value of the line
        $run_limit = $word[5] if defined $word[5] && $word[5] =~ /[0-9]/;
        last;
    }

    my $minutes = _to_number($run_limit);
    my ($hour, $min) = (0, int($minutes));
    if ($minutes >= 60) {
        $hour = int($minutes / 60);
        $min  = $minutes - ($hour * 60);
    }
    return ($run_limit, $hour, $min);
}

########################################################
# f_nb_proc
# number of processors of the job (1 if not specified)
# search "Processors" in the paragraph "Submitted from host";
# bjobs -l wraps this paragraph at any column, so its lines are
# glued back together before searching
# use bjobs -l $jobid
########################################################
sub f_nb_proc {
    my $paragraph    = "";
    my $in_paragraph = 0;

    foreach my $raw (@bjobsl_out) {
        my $line = $raw;    # copy: @bjobsl_out is not modified
        if ($in_paragraph && $line =~ /[\w\d]/) {
            # continuation line: drop the indentation and the newline
            $line =~ s/^\s+//;
            chomp($line);
            $paragraph .= $line;
        }
        elsif ($in_paragraph && $line =~ /^$/) {
            # empty line: end of the paragraph
            $in_paragraph = 0;
        }
        elsif (!$in_paragraph && $line =~ /Submitted from host/) {
            # first line of the paragraph
            chomp($line);
            $paragraph    = $line;
            $in_paragraph = 1;
        }
    }

    # the count is the integer just before "Processors"; the space between
    # them is missing when the line was wrapped exactly there
    my ($count) = $paragraph =~ /(\d+)\s*Processors/;
    return ($count && $count > 0) ? $count + 0 : 1;
}

########################################################
# f_hog_factor
# hog factor in percent = 100 * run time / turnaround time,
# capped at 999.9; 999.9 is also returned when the turnaround
# time is 0 or not reported
# turnaround time = PEND + PSUSP + RUN + USUSP + SSUSP
# use bhist $jobid
########################################################
sub f_hog_factor {
    my @field      = _bhist_summary_fields();
    my $run_time   = _to_number($field[3]);
    my $total_time = _to_number($field[7]);
    return 999.9 if $total_time == 0;

    my $hog_factor = 100 * $run_time / $total_time;
    return $hog_factor > 999.9 ? 999.9 : $hog_factor;
}

__END__

=head1 NAME

bmonitor - display information about LSF jobs and hosts

=head1 DESCRIPTION

bmonitor is a Perl script to monitor LSF jobs. Every minute the script
shows useful information about each job. This script uses LSF (Load
Sharing Facility).

=head1 README

bmonitor is a Perl script to monitor LSF jobs. Every minute the script
shows useful information about each job:

    CPU         : number of processors requested by the user
    MEM         : memory used by the job (MB)
    SWAP        : swap used by the job (MB)
    CPUTIM      : cpu time of the job (hh:mm)
    JOB         : identification number of the job
    USER        : user login
    STAT        : status of the job (PEND, PSUSP, USUSP, SSUSP, RUN)
    QUEUE       : name of the queue
    RUNTIM      : run time, time spent by the job in RUN status (hh:mm)
    RUNLIM      : maximum run time requested by the user (hh:mm)
    EFF         : cpu time / (run time * number of proc), in percent
    HOG         : run time / total time the job spent in LSF, in percent
    EXECHOST    : execution host
    SUBMIT_TIME : date of the submission

The options of bjobs can be used with bmonitor; without options,
bmonitor runs C<bjobs -u all>.

This Perl script is available from http://cch.loria.fr/LSF/bmonitor/
and uses the GNU General Public License.

For more information send a mail to Christian.Rossi@loria.fr.

Centre Charles Hermite/LORIA - Nancy - France
http://cch.loria.fr/
http://www.loria.fr/

=head1 AUTHOR

Christian Rossi

=pod OSNAMES

Unix

=pod SCRIPT CATEGORIES

UNIX/System_administration

=cut