root / plugins / boinc / boinc_estwk @ 17f78427
Historique | Voir | Annoter | Télécharger (13,2 ko)
| 1 |
#!/usr/bin/perl -w |
|---|---|
| 2 |
# |
| 3 |
# boinc_estwk - Munin plugin to monitor estimated time of BOINC WUs |
| 4 |
# |
| 5 |
# Run 'perldoc boinc_estwk' for full man page |
| 6 |
# |
| 7 |
# Author: Palo M. <palo.gm@gmail.com> |
| 8 |
# License: GPLv3 <http://www.gnu.org/licenses/gpl-3.0.txt> |
| 9 |
# |
| 10 |
# |
| 11 |
# Parameters supported: |
| 12 |
# config |
| 13 |
# |
| 14 |
# |
| 15 |
# Configurable variables |
| 16 |
# boinccmd - command-line control program (default: boinccmd) |
| 17 |
# host - Host to query (default: none) |
| 18 |
# port - GUI RPC port (default: none = use BOINC-default) |
| 19 |
# boincdir - Directory containing appropriate password file |
| 20 |
# gui_rpc_auth.cfg (default: none) |
| 21 |
# estwk_warn - Warning level - minimum estimated work (default: 24.00 hours) |
| 22 |
# password - Password for BOINC (default: none) !!! UNSAFE !!! |
| 23 |
# |
| 24 |
# |
| 25 |
# $Log$ |
| 26 |
# |
| 27 |
# Revision 1.0 2009/09/13 Palo M. |
| 28 |
# Add documentation and license information |
| 29 |
# Ready to publish on Munin Exchange |
| 30 |
# Revision 0.9 2009/09/13 Palo M. |
| 31 |
# Add possibility to read password from file |
| 32 |
# Revision 0.8 2009/09/12 Palo M. |
| 33 |
# Update default binary name: boinc_cmd -> boinccmd |
| 34 |
# Revision 0.7 2008/08/30 Palo M. |
| 35 |
# Creation - Attempt to port functionality from C++ code |
| 36 |
# |
| 37 |
# (Revisions 0.1 - 0.6) were done in C++ |
| 38 |
# |
| 39 |
# |
| 40 |
# |
| 41 |
# Magic markers: |
| 42 |
#%# family=contrib |
| 43 |
|
| 44 |
use strict; |
| 45 |
|
| 46 |
|
| 47 |
#########################################################################
# 1. Parse configuration variables
#
# Every setting is taken from the environment (the plugin documentation
# shows them being supplied as "env.<name>" entries).  A variable that is
# absent from the environment falls back to the default given here.

# Command-line control program; may be a bare name or a full path.
my $BOINCCMD = "boinccmd";
if (exists $ENV{'boinccmd'}) {
    $BOINCCMD = $ENV{'boinccmd'};
}

# Optional settings without a default: undefined unless configured.
my ($HOST, $PORT, $PASSWORD, $BOINCDIR) =
    map { exists $ENV{$_} ? $ENV{$_} : undef } qw(host port password boincdir);

# Warning level: minimum estimated work per CPU, in hours.
my $ESTWKWRN = 24;
if (exists $ENV{'estwk_warn'}) {
    $ESTWKWRN = $ENV{'estwk_warn'};
}
| 56 |
|
| 57 |
#########################################################################
# 2. Basic executable
#
# Extend the boinccmd command line with the connection options that were
# configured.  The resulting string is later executed through the shell
# (backticks), so all values are expected to come from trusted plugin
# configuration.
if (defined $HOST) {
    $BOINCCMD .= " --host $HOST";
    # A port is only appended to an explicit host, as "HOST:PORT"
    if (defined $PORT) {
        $BOINCCMD .= ":$PORT";
    }
}
if (defined $PASSWORD) {
    # Discouraged: the password becomes visible on the command line
    $BOINCCMD .= " --passwd $PASSWORD";
}
if (defined $BOINCDIR) {
    # Per the plugin documentation, boinccmd reads gui_rpc_auth.cfg itself
    # when run from this directory.  A failed chdir would otherwise only
    # show up as an unexplained lack of data, so report it (without
    # aborting; the query may still succeed without a password file).
    chdir $BOINCDIR
        or warn "boinc_estwk: cannot chdir to '$BOINCDIR': $!\n";
}
| 72 |
|
| 73 |
#########################################################################
# 3. Get host info, to retrieve number of CPUs
#
# Runs "boinccmd --get_host_info" and extracts the value of its single
# "#CPUS: <n>" line into $nCPUs.  Dies when that line is missing,
# duplicated or carries a value below 1; exits with -1 when the command
# produced no output at all.
my $nCPUs;
my $hostInfo = `$BOINCCMD --get_host_info 2>/dev/null`;
if ($hostInfo ne "") {
    my @cpu_lines = grep { /^\s+#CPUS: / } split(/\n/, $hostInfo);
    unless (scalar(@cpu_lines) == 1) {
        die "Unexpected output from boinccmd";
    }
    ($nCPUs = $cpu_lines[0]) =~ s/^\s+#CPUS: //;
    {
        # The value may not be numeric; silence the warning for this
        # comparison only.
        no warnings;
        die "Unexpected output from boinccmd" if $nCPUs < 1;
    }
}
else {
    # No host info (e.g. client not running)
    exit -1;
}
| 91 |
|
| 92 |
#print "$nCPUs\n"; |
| 93 |
|
| 94 |
#########################################################################
# 4. Display config if applicable
#
# When called with the single argument "config", print the graph and
# field definitions (one "cpu<N>" field per CPU found above) and exit
# without fetching any values.
if (defined($ARGV[0]) and $ARGV[0] eq "config") {

    # Report under the queried host's name when a host is configured
    print "host_name $HOST\n" if defined $HOST;

    # Graph-wide attributes
    print "graph_title BOINC work cache estimation\n",
          "graph_category htc\n",
          "graph_args --base 1000 -l 0 --alt-autoscale-max\n",
          "graph_vlabel Hours\n",
          "graph_scale no\n";

    # Longest WU is AREA, each CPU estimated is LINE2
    print "longest.label Longest WU\n",
          "longest.draw AREA\n",
          "longest.type GAUGE\n";

    my $cpu = 0;
    while ($cpu < $nCPUs) {
        print "cpu$cpu.label CPU$cpu\n",
              "cpu$cpu.draw LINE2\n",
              "cpu$cpu.type GAUGE\n";
        # Warn below the configured number of hours, critical below zero
        printf "cpu$cpu.warning %.2f:\n", $ESTWKWRN;
        print "cpu$cpu.critical 0:\n";
        ++$cpu;
    }

    exit 0;
}
| 123 |
|
| 124 |
#########################################################################
# 5. Fetch all needed data from BOINC-client with single call
#
# The output of "boinccmd --get_simple_gui_info" is divided into sections
# introduced by header lines of the form "==== <Title> ====".  The text of
# the "Projects" and "Results" sections is stored in $prj_status and
# $results; either one stays "" when its section is not present.
my $prj_status = "";
my $results    = "";

my $simpleGuiInfo = `$BOINCCMD --get_simple_gui_info 2>/dev/null`;
if (defined $simpleGuiInfo && $simpleGuiInfo ne "") {
    # Some data were retrieved, so let's split them
    for my $wanted ( [ 'Projects', \$prj_status ],
                     [ 'Results',  \$results ] ) {
        my ($title, $target) = @$wanted;

        # Everything following the wanted header ...
        my @sections = split /=+ \Q$title\E =+\n/, $simpleGuiInfo;
        next if @sections < 2;    # header not found: keep ""

        # ... up to the next header of any title.  [A-Za-z] rather than
        # [A-z]: the latter also matches the punctuation characters that
        # sit between 'Z' and 'a' in ASCII.
        my @section1 = split /=+ [A-Za-z]+ =+\n/, $sections[1];
        next if !@section1;       # nothing after the header: keep ""

        $$target = $section1[0];
    }
}
| 143 |
|
| 144 |
#########################################################################
# 6. Parse BOINC data
#

# Return the values of all lines of the form "<indent><label>: <value>"
# found in @lines, in input order.  The list is empty when no line
# carries the label.  The label is matched literally.
sub _boinc_field_values {
    my ($label, @lines) = @_;
    my @values;
    for my $line (@lines) {
        push @values, $1 if $line =~ /^\s+\Q$label\E: (.*)$/;
    }
    return @values;
}

# 6.a) Get suspended projects
#      Each project entry starts with a "<n>) -----" separator line and
#      must contain exactly one "master URL" and one "suspended via GUI"
#      line.
my @prjInfos = split /\d+\) -+\n/, $prj_status;
shift @prjInfos; # Throw out first empty line

my @susp_projects; # array of suspended projects (master URLs)
for my $prj_info (@prjInfos) {
    my @lines = split /\n/, $prj_info;

    my @prjURL = _boinc_field_values('master URL', @lines);
    if (@prjURL != 1) { die "Unexpected output from boinccmd"; }

    my @suspGUI = _boinc_field_values('suspended via GUI', @lines);
    if (@suspGUI != 1) { die "Unexpected output from boinccmd"; }

    if ($suspGUI[0] eq "yes") {
        push @susp_projects, $prjURL[0];
    }
}
# Nothing is printed here: this script's standard output is reserved for
# the "<field>.value <number>" lines written at the end.

# 6.b) Parse results, check their states
#      Get those which are NOT suspended by GUI
my @rsltInfos = split /\d+\) -+\n/, $results;
shift @rsltInfos; # Throw out first empty line
my @rsltRemain;   # estimated remaining CPU time of each result in the cache

for my $rslt_info (@rsltInfos) {
    my @lines = split /\n/, $rslt_info;

    # First occurrence of every field of interest.  A missing field is
    # represented by "" so that none of the comparisons below matches it.
    my %field;
    for my $label ('estimated CPU time remaining', 'scheduler state',
                   'state', 'active_task_state', 'suspended via GUI',
                   'project URL') {
        my ($value) = _boinc_field_values($label, @lines);
        $field{$label} = defined $value ? $value : "";
    }

    # Result suspended by the user: not in work cache - at the moment
    next if $field{'suspended via GUI'} eq "yes";

    # Result of a suspended project: not in work cache - at the moment.
    # Plain string comparison; URLs contain regex metacharacters.
    next if grep { $_ eq $field{'project URL'} } @susp_projects;

    # Only RESULT_FILES_DOWNLOADED (2) can be in the work cache.
    # RESULT_FILES_DOWNLOADING, RESULT_COMPUTE_ERROR,
    # RESULT_FILES_UPLOADING, RESULT_FILES_UPLOADED and RESULT_ABORTED
    # are not.
    next unless $field{'state'} eq "2";

    my $schedstat = $field{'scheduler state'};
    my $acttask   = $field{'active_task_state'};

    my $in_cache =
        # CPU_SCHED_UNINITIALIZED 0 - not started yet
        # CPU_SCHED_PREEMPTED     1 - preempted
        ( $schedstat eq "0" || $schedstat eq "1" )
        ||
        # CPU_SCHED_SCHEDULED     2, combined with one of
        #   PROCESS_UNINITIALIZED 0
        #   PROCESS_EXECUTING     1 - running
        #   PROCESS_SUSPENDED     9 - suspended by "user active"/benchmark?
        # Any other active-task-state may be a failing/aborted WU.
        ( $schedstat eq "2"
          && ( $acttask eq "0" || $acttask eq "1" || $acttask eq "9" ) );
    next unless $in_cache;

    # Without an estimate the result cannot contribute anything
    next if $field{'estimated CPU time remaining'} eq "";

    push @rsltRemain, $field{'estimated CPU time remaining'};
}
| 242 |
|
| 243 |
#########################################################################
# 7. Distribute remaining results per CPUs
#
# Longest-first assignment: walk the results from longest to shortest and
# always hand the next one to the CPU that currently has the least work
# queued.

# 7.a) Sort remaining results descending
my @sortRemain = sort { $b <=> $a } @rsltRemain;

# 7.b) Assign to CPU with smallest workcache
my @CPUcache;
push @CPUcache, 0 while @CPUcache < $nCPUs;    # one empty slot per CPU

for my $wu_seconds (@sortRemain) {
    # find CPU with smallest workcache:
    my $least = 0;
    for my $cpu (1 .. $#CPUcache) {
        $least = $cpu if $CPUcache[$cpu] < $CPUcache[$least];
    }
    $CPUcache[$least] += $wu_seconds;
}

# At the end, sort CPUs descending
@CPUcache = sort { $b <=> $a } @CPUcache;
| 264 |
|
| 265 |
#########################################################################
# 8. Display output
#
# Prints one "<field>.value <hours>" line for the longest result and one
# per CPU, then exits successfully.

# With an empty work cache there is no longest result; report 0 hours
# instead of dividing an undefined value.
my $longestRemain = defined $sortRemain[0] ? $sortRemain[0] : 0;

# Convert from seconds to hours
printf "longest.value %.2f\n", $longestRemain / 3600;
for (my $i = 0; $i < $nCPUs; ++$i) {
    printf "cpu$i.value %.2f\n", $CPUcache[$i] / 3600;
}

exit 0;
| 276 |
|
| 277 |
|
| 278 |
######################################################################### |
| 279 |
# perldoc section |
| 280 |
|
| 281 |
=head1 NAME |
| 282 |
|
| 283 |
boinc_estwk - Munin plugin to monitor estimated time of BOINC WUs |
| 284 |
|
| 285 |
=head1 APPLICABLE SYSTEMS |
| 286 |
|
| 287 |
Linux machines running BOINC and munin-node |
| 288 |
|
| 289 |
- or - |
| 290 |
|
| 291 |
Linux servers (running munin-node) used to collect data from other systems |
| 292 |
which are running BOINC, but not running munin-node (e.g. non-Linux systems) |
| 293 |
|
| 294 |
=head1 CONFIGURATION |
| 295 |
|
| 296 |
Following configuration variables are supported: |
| 297 |
|
| 298 |
=over 12 |
| 299 |
|
| 300 |
=item B<boinccmd> |
| 301 |
|
| 302 |
command-line control program (default: boinccmd) |
| 303 |
|
| 304 |
=item B<host> |
| 305 |
|
| 306 |
Host to query (default: none) |
| 307 |
|
| 308 |
=item B<port> |
| 309 |
|
| 310 |
GUI RPC port (default: none = use BOINC-default) |
| 311 |
|
| 312 |
=item B<boincdir> |
| 313 |
|
| 314 |
Directory containing appropriate file gui_rpc_auth.cfg (default: none) |
| 315 |
|
| 316 |
=item B<estwk_warn> |
| 317 |
|
| 318 |
Warning level - minimum estimated work (default: 24.00 hours) |
| 319 |
|
| 320 |
=item B<password> |
| 321 |
|
| 322 |
Password for BOINC (default: none) |
| 323 |
|
| 324 |
=back |
| 325 |
|
| 326 |
=head2 B<Security Consideration:> |
| 327 |
|
| 328 |
Using of variable B<password> poses a security risk. Even if the Munin |
| 329 |
configuration file for this plugin containing BOINC-password is properly |
| 330 |
protected, the password is exposed as environment variable and finally passed |
| 331 |
to boinccmd as a parameter. It is therefore possible for local users of the |
| 332 |
machine running this plugin to eavesdrop the BOINC password. |
| 333 |
|
| 334 |
Using of variable password is therefore strongly discouraged and is left here |
| 335 |
as a legacy option and for testing purposes. |
| 336 |
|
| 337 |
It should be always possible to use B<boincdir> variable instead - in such case |
| 338 |
the file gui_rpc_auth.cfg is read by boinccmd binary directly. |
| 339 |
If this plugin is used to fetch data from remote system, the gui_rpc_auth.cfg |
| 340 |
can be copied to special directory in a secure way (e.g. via scp) and properly |
| 341 |
protected by file permissions. |
| 342 |
|
| 343 |
=head1 INTERPRETATION |
| 344 |
|
| 345 |
This plugin shows the estimated remaining computation time for all CPUs of |
| 346 |
the machine and the estimated remaining computation time of longest workunit. |
| 347 |
The estimation is based on assumption that the workunits of different lengths |
| 348 |
will be distributed to the CPUs evenly (which is not always the case). |
| 349 |
|
| 350 |
The warning level can be used to warn in advance about the risk of workunits |
| 351 |
local cache depletion and BOINC client running out of the work. |
| 352 |
Although such warning can be achieved by configuring Munin master, there is |
| 353 |
also this option to configure it on munin-node side. |
| 354 |
|
| 355 |
=head1 EXAMPLES |
| 356 |
|
| 357 |
=head2 Local BOINC Example |
| 358 |
|
| 359 |
BOINC is running on local machine. The BOINC binaries are installed in |
| 360 |
F</opt/boinc/custom-6.10.1/>, the BOINC is running in directory |
| 361 |
F</usr/local/boinc/> under username boinc, group boinc and the password is used |
| 362 |
to protect access to BOINC. |
| 363 |
Warning will be set when estimated work for any of CPUs will decrease under |
| 364 |
48 hours: |
| 365 |
|
| 366 |
[boinc_*] |
| 367 |
group boinc |
| 368 |
env.boinccmd /opt/boinc/custom-6.10.1/boinccmd |
| 369 |
env.boincdir /usr/local/boinc |
| 370 |
env.estwk_warn 48 |
| 371 |
|
| 372 |
=head2 Remote BOINC Example |
| 373 |
|
| 374 |
BOINC is running on 2 remote machines C<foo> and C<bar>. |
| 375 |
On the local machine the binary of command-line interface is installed in |
| 376 |
directory F</usr/local/bin/>. |
| 377 |
The BOINC password used on the remote machine C<foo> is stored in file |
| 378 |
F</etc/munin/boinc/foo/gui_rpc_auth.cfg>. |
| 379 |
The BOINC password used on the remote machine C<bar> is stored in file |
| 380 |
F</etc/munin/boinc/bar/gui_rpc_auth.cfg>. |
| 381 |
These files are owned and readable by root, readable by group munin and not |
| 382 |
readable by others. |
| 383 |
There are 2 symbolic links to this plugin created in the munin plugins |
| 384 |
directory (usually F</etc/munin/plugins/>): F<snmp_foo_boincestwk> and |
| 385 |
F<snmp_bar_boincestwk> |
| 386 |
|
| 387 |
[snmp_foo_boinc*] |
| 388 |
group munin |
| 389 |
env.boinccmd /usr/local/bin/boinccmd |
| 390 |
env.host foo |
| 391 |
env.boincdir /etc/munin/boinc/foo |
| 392 |
|
| 393 |
[snmp_bar_boinc*] |
| 394 |
group munin |
| 395 |
env.boinccmd /usr/local/bin/boinccmd |
| 396 |
env.host bar |
| 397 |
env.boincdir /etc/munin/boinc/bar |
| 398 |
|
| 399 |
This way the plugin can be used by Munin the same way as the Munin plugins |
| 400 |
utilizing SNMP (although this plugin itself does not use SNMP). |
| 401 |
|
| 402 |
=head1 BUGS |
| 403 |
|
| 404 |
The estimation is based on simple assumption, that longest workunits will be |
| 405 |
processed first. This is the case when work is distributed evenly among CPUs. |
| 406 |
But this is not always the case, because various deadlines for various |
| 407 |
workunits may fire the "panic mode" of BOINC and scheduling could be much |
| 408 |
different. |
| 409 |
For example, there can be 4 CPUs, and BOINC having downloaded 4 workunits |
| 410 |
with estimated run-time 1 hour each and 3 workunits with estimated run-time |
| 411 |
4 hours each. |
| 412 |
This Munin plugin will report estimated work 4 hours for each CPU. |
| 413 |
But if deadline of those 1-hour workunits will be much shorter than deadline |
| 414 |
of those 4-hours workunits, BOINC will schedule short workunits first (for all |
| 415 |
4 CPUs) and after finishing them it will schedule those long workunits. |
| 416 |
This will result in real computation for 5 hours on 3 CPUs but only 1 hour on |
| 417 |
remaining 4th CPU. So after 1 hour of computation 1 of CPUs will run out of |
| 418 |
work. |
| 419 |
|
| 420 |
There is no C<autoconf> capability at the moment. This is due to the fact, that |
| 421 |
BOINC installations may vary over different systems, sometimes using default |
| 422 |
directory from distribution (e.g. F</var/lib/boinc/> in Debian or Ubuntu), but |
| 423 |
often running in user directories or in other separate directories. |
| 424 |
Also the user-ID under which BOINC runs often differs. |
| 425 |
Under these circumstances the C<autoconf> would be either lame or too |
| 426 |
complicated. |
| 427 |
|
| 428 |
=head1 AUTHOR |
| 429 |
|
| 430 |
Palo M. <palo.gm@gmail.com> |
| 431 |
|
| 432 |
=head1 LICENSE |
| 433 |
|
| 434 |
GPLv3 L<http://www.gnu.org/licenses/gpl-3.0.txt> |
| 435 |
|
| 436 |
=cut |
| 437 |
|
| 438 |
# vim:syntax=perl |
