root / plugins / boinc / boinc_estwk @ 17f78427
Historique | Voir | Annoter | Télécharger (13,2 ko)
| 1 | f1cbf1ac | Palo M | #!/usr/bin/perl -w |
|---|---|---|---|
| 2 | # |
||
| 3 | # boinc_estwk - Munin plugin to monitor estimated time of BOINC WUs |
||
| 4 | # |
||
| 5 | # Run 'perldoc boinc_estwk' for full man page |
||
| 6 | # |
||
| 7 | # Author: Palo M. <palo.gm@gmail.com> |
||
| 8 | # License: GPLv3 <http://www.gnu.org/licenses/gpl-3.0.txt> |
||
| 9 | # |
||
| 10 | # |
||
| 11 | # Parameters supported: |
||
| 12 | # config |
||
| 13 | # |
||
| 14 | # |
||
| 15 | # Configurable variables |
||
| 16 | # boinccmd - command-line control program (default: boinccmd) |
||
| 17 | # host - Host to query (default: none) |
||
| 18 | # port - GUI RPC port (default: none = use BOINC-default) |
||
| 19 | # boincdir - Directory containing appropriate password file |
||
| 20 | # gui_rpc_auth.cfg (default: none) |
||
| 21 | # estwk_warn - Warning level - minimum estimated work (default: 24.00 hours) |
||
| 22 | # password - Password for BOINC (default: none) !!! UNSAFE !!! |
||
| 23 | # |
||
| 24 | # |
||
| 25 | # $Log$ |
||
| 26 | # |
||
| 27 | # Revision 1.0 2009/09/13 Palo M. |
||
| 28 | # Add documentation and license information |
||
| 29 | # Ready to publish on Munin Exchange |
||
| 30 | # Revision 0.9 2009/09/13 Palo M. |
||
| 31 | # Add possibility to read password from file |
||
| 32 | # Revision 0.8 2009/09/12 Palo M. |
||
| 33 | # Update default binary name: boinc_cmd -> boinccmd |
||
| 34 | # Revision 0.7 2008/08/30 Palo M. |
||
| 35 | # Creation - Attempt to port functionality from C++ code |
||
| 36 | # |
||
| 37 | # (Revisions 0.1 - 0.6) were done in C++ |
||
| 38 | # |
||
| 39 | # |
||
| 40 | # |
||
| 41 | # Magic markers: |
||
| 42 | #%# family=contrib |
||
| 43 | |||
| 44 | use strict; |
||
| 45 | |||
| 46 | |||
#########################################################################
# 1. Parse configuration variables
#
# Every setting is taken from the plugin's environment; the fallback value
# is used only when the environment variable does not exist at all.
# Each table row is [ environment variable name, fallback value ].
my ($BOINCCMD, $HOST, $PORT, $PASSWORD, $BOINCDIR, $ESTWKWRN) =
    map { exists $ENV{ $_->[0] } ? $ENV{ $_->[0] } : $_->[1] } (
        [ 'boinccmd',   "boinccmd" ],    # command-line control program
        [ 'host',       undef      ],    # host to query
        [ 'port',       undef      ],    # GUI RPC port
        [ 'password',   undef      ],    # password for BOINC (unsafe)
        [ 'boincdir',   undef      ],    # directory with gui_rpc_auth.cfg
        [ 'estwk_warn', 24         ],    # warning level, in hours
    );
| 56 | |||
#########################################################################
# 2. Basic executable
#
# Extend $BOINCCMD with the connection options so that every later call
# only has to append the actual request.
#
# NOTE(review): $HOST, $PORT and $PASSWORD are placed on a shell command
# line unquoted; values containing shell metacharacters will not reach
# boinccmd intact.
if (defined $HOST) {
    $BOINCCMD .= " --host $HOST";
    if (defined $PORT) {
        # The port is appended to the host argument as "host:port"
        $BOINCCMD .= ":$PORT";
    }
}
if (defined $PASSWORD) {
    $BOINCCMD .= " --passwd $PASSWORD";
}
if (defined $BOINCDIR) {
    # boinccmd reads gui_rpc_auth.cfg itself, so it has to be started from
    # the directory holding that file.  Fail loudly if we cannot get there
    # instead of continuing from the wrong directory.
    chdir $BOINCDIR
        or die "Cannot change directory to '$BOINCDIR': $!\n";
}
| 72 | |||
#########################################################################
# 3. Get host info, to retrieve number of CPUs
#
my $nCPUs;
my $hostInfo = `$BOINCCMD --get_host_info 2>/dev/null`;
if ($hostInfo eq "") {
    # No host info (e.g. client not running)
    exit -1;
}
{
    # The report must contain exactly one "#CPUS:" line
    my @cpuLines = grep { /^\s+#CPUS: / } split(/\n/, $hostInfo);
    die "Unexpected output from boinccmd" unless scalar(@cpuLines) == 1;

    # Keep only what follows the label
    ($nCPUs = $cpuLines[0]) =~ s/^\s+#CPUS: //;

    no warnings;    # the numeric comparison below must stay silent
    die "Unexpected output from boinccmd" if $nCPUs < 1;
}
| 91 | |||
| 92 | #print "$nCPUs\n"; |
||
| 93 | |||
#########################################################################
# 4. Display config if applicable
#
# When called with the single argument "config", describe the graph and
# its data series, then stop without querying any work-unit data.
if (defined $ARGV[0] and $ARGV[0] eq "config") {

    # host_name is emitted only when a host was configured
    print "host_name $HOST\n" if defined $HOST;

    # Graph-wide settings
    print "graph_title BOINC work cache estimation\n",
          "graph_category htc\n",
          "graph_args --base 1000 -l 0 --alt-autoscale-max\n",
          "graph_vlabel Hours\n",
          "graph_scale no\n";

    # Longest WU is AREA, each CPU estimated is LINE2
    print "longest.label Longest WU\n",
          "longest.draw AREA\n",
          "longest.type GAUGE\n";

    my $cpu = 0;
    while ($cpu < $nCPUs) {
        print "cpu$cpu.label CPU$cpu\n",
              "cpu$cpu.draw LINE2\n",
              "cpu$cpu.type GAUGE\n";
        # Warning threshold is the configured minimum of estimated work
        printf "cpu$cpu.warning %.2f:\n", $ESTWKWRN;
        print "cpu$cpu.critical 0:\n";
        ++$cpu;
    }

    exit 0;
}
| 123 | |||
#########################################################################
# 5. Fetch all needed data from BOINC-client with single call
#
# $prj_status receives the text of the "Projects" section and $results the
# text of the "Results" section of the report.  Both stay empty strings
# when boinccmd returned nothing or the section is not present, so the
# parsing steps below never see an undefined value.
my $prj_status = "";
my $results = "";

my $simpleGuiInfo = `$BOINCCMD --get_simple_gui_info 2>/dev/null`;
if (defined $simpleGuiInfo && $simpleGuiInfo ne "") {
    # Return the text between the "==== <title> ====" header line and the
    # next single-word header line (or the end of the report); "" when the
    # header is missing.
    my $sectionText = sub {
        my ($title, $report) = @_;
        my @after = split /=+ \Q$title\E =+\n/, $report;
        return "" unless defined $after[1];
        # [A-Za-z] rather than [A-z]: the latter also matches [ \ ] ^ _ `
        my @upto = split /=+ [A-Za-z]+ =+\n/, $after[1];
        return defined $upto[0] ? $upto[0] : "";
    };

    $prj_status = $sectionText->('Projects', $simpleGuiInfo);
    $results    = $sectionText->('Results',  $simpleGuiInfo);
}
| 143 | |||
#########################################################################
# 6. Parse BOINC data
#
# 6.a) Get suspended projects
# Project records are separated by lines of the form "<number>) -----".
my @prjInfos = split /\d+\) -+\n/, (defined $prj_status ? $prj_status : "");
shift @prjInfos; # Throw out the text preceding the first record

my @susp_projects; # master URLs of projects suspended via GUI
for my $prj_info (@prjInfos) {
    my @lines = split /\n/, $prj_info;

    # Exactly one line of each kind is expected in a project record
    my @prjURL = grep { /^\s+master URL: / } @lines;
    die "Unexpected output from boinccmd" if scalar(@prjURL) != 1;
    my @suspGUI = grep { /^\s+suspended via GUI: / } @lines;
    die "Unexpected output from boinccmd" if scalar(@suspGUI) != 1;

    # Strip the labels, keeping only the values
    (my $prjURL = $prjURL[0]) =~ s/^\s+master URL: //;
    (my $suspGUI = $suspGUI[0]) =~ s/^\s+suspended via GUI: //;

    push @susp_projects, $prjURL if $suspGUI eq "yes";
}
# Nothing is printed here: standard output carries only the plugin's
# "<field>.value" lines, so the list of suspended projects must not be
# echoed.
| 167 | |||
# 6.b) Parse results, check their states
#      Get those which are NOT suspended by GUI
#
# @rsltRemain collects the "estimated CPU time remaining" value of every
# result that currently counts as part of the work cache.
my @rsltInfos = split /\d+\) -+\n/, (defined $results ? $results : "");
shift @rsltInfos; # Throw out the text preceding the first record
my @rsltRemain;

# Return the text following "<label>: " on the first line of a record that
# carries this label, or "" when the record has no such line.
my $fieldOf = sub {
    my ($label, @lines) = @_;
    for my $line (@lines) {
        return $1 if $line =~ /^\s+\Q$label\E: (.*)$/;
    }
    return "";
};

# Suspended projects as a lookup table.  URLs are compared as plain
# strings; matching them as a regex would let characters such as "." or
# "?" in a URL distort the comparison.
my %isSuspPrj = map { $_ => 1 } @susp_projects;

for my $rslt_info (@rsltInfos) {
    my @lines = split /\n/, $rslt_info;

    my $estRemain = $fieldOf->('estimated CPU time remaining', @lines);
    my $schedstat = $fieldOf->('scheduler state', @lines);
    my $state     = $fieldOf->('state', @lines);
    my $acttask   = $fieldOf->('active_task_state', @lines);
    my $suspGUI   = $fieldOf->('suspended via GUI', @lines);
    my $prjURL    = $fieldOf->('project URL', @lines);

    # Result suspended, or its whole project suspended:
    # not in work cache - at the moment
    next if $suspGUI eq "yes";
    next if $isSuspPrj{$prjURL};

    # Only RESULT_FILES_DOWNLOADED (state 2) can be in the work cache.
    # RESULT_FILES_DOWNLOADING, RESULT_COMPUTE_ERROR,
    # RESULT_FILES_UPLOADING, RESULT_FILES_UPLOADED and RESULT_ABORTED
    # are not.
    next unless $state eq "2";

    my $inCache =
           $schedstat eq "0"       # CPU_SCHED_UNINITIALIZED: not started yet
        || $schedstat eq "1"       # CPU_SCHED_PREEMPTED
        || (   $schedstat eq "2"   # CPU_SCHED_SCHEDULED
            && (   $acttask eq "1"      # PROCESS_EXECUTING: running
                || $acttask eq "0"      # PROCESS_UNINITIALIZED
                || $acttask eq "9" ) ); # PROCESS_SUSPENDED
    # Any other active-task state (maybe a failing/aborted WU) or any
    # other scheduler state is not part of the work cache.
    next unless $inCache;

    # A record without an estimate contributes no work
    push @rsltRemain, ($estRemain eq "" ? 0 : $estRemain);
}
| 242 | |||
#########################################################################
# 7. Distribute remaining results per CPUs
#
# 7.a) Longest results first
my @sortRemain = sort { $b <=> $a } @rsltRemain;

# 7.b) One accumulator of queued work per CPU, all starting at zero
my @CPUcache;
for (my $cpu = 0; $cpu < $nCPUs; $cpu++) {
    push @CPUcache, 0;
}

# Hand every result to whichever CPU has the least work queued so far
for my $length (@sortRemain) {
    my $least = 0;
    for my $cpu (1 .. $#CPUcache) {
        $least = $cpu if $CPUcache[$cpu] < $CPUcache[$least];
    }
    $CPUcache[$least] += $length;
}

# At the end, order the per-CPU totals from largest to smallest
@CPUcache = sort { $b <=> $a } @CPUcache;
| 264 | |||
#########################################################################
# 8. Display output
#
# Internal figures are in seconds; the graph is in hours.
# With an empty work cache there is no longest result, so 0 is reported
# rather than dividing an undefined value.
my $longest = defined $sortRemain[0] ? $sortRemain[0] : 0;
printf "longest.value %.2f\n", $longest / 3600;
for (my $i = 0; $i < $nCPUs; ++$i) {
    my $cached = defined $CPUcache[$i] ? $CPUcache[$i] : 0;
    printf "cpu$i.value %.2f\n", $cached / 3600;
}

exit 0;
| 276 | |||
| 277 | |||
| 278 | ######################################################################### |
||
| 279 | # perldoc section |
||
| 280 | |||
| 281 | =head1 NAME |
||
| 282 | |||
| 283 | boinc_estwk - Munin plugin to monitor estimated time of BOINC WUs |
||
| 284 | |||
| 285 | =head1 APPLICABLE SYSTEMS |
||
| 286 | |||
| 287 | Linux machines running BOINC and munin-node |
||
| 288 | |||
| 289 | - or - |
||
| 290 | |||
| 291 | 17f78427 | Lars Kruse | Linux servers (running munin-node) used to collect data from other systems |
| 292 | f1cbf1ac | Palo M | which are running BOINC, but not running munin-node (e.g. non-Linux systems) |
| 293 | |||
| 294 | =head1 CONFIGURATION |
||
| 295 | |||
| 296 | Following configuration variables are supported: |
||
| 297 | |||
| 298 | =over 12 |
||
| 299 | |||
| 300 | =item B<boinccmd> |
||
| 301 | |||
| 302 | command-line control program (default: boinccmd) |
||
| 303 | |||
| 304 | =item B<host> |
||
| 305 | |||
| 306 | Host to query (default: none) |
||
| 307 | |||
| 308 | =item B<port> |
||
| 309 | |||
| 310 | GUI RPC port (default: none = use BOINC-default) |
||
| 311 | |||
| 312 | =item B<boincdir> |
||
| 313 | |||
| 314 | Directory containing appropriate file gui_rpc_auth.cfg (default: none) |
||
| 315 | |||
| 316 | =item B<estwk_warn> |
||
| 317 | |||
| 318 | Warning level - minimum estimated work (default: 24.00 hours) |
||
| 319 | |||
| 320 | =item B<password> |
||
| 321 | |||
| 322 | 17f78427 | Lars Kruse | Password for BOINC (default: none) |
| 323 | f1cbf1ac | Palo M | |
| 324 | =back |
||
| 325 | |||
| 326 | =head2 B<Security Consideration:> |
||
| 327 | |||
| 328 | 17f78427 | Lars Kruse | Using the B<password> variable poses a security risk. Even if the Munin |
| 329 | configuration file for this plugin containing BOINC-password is properly |
||
| 330 | protected, the password is exposed as environment variable and finally passed |
||
| 331 | to boinccmd as a parameter. It is therefore possible for local users of the |
||
| 332 | machine running this plugin to eavesdrop the BOINC password. |
||
| 333 | f1cbf1ac | Palo M | |
| 334 | 17f78427 | Lars Kruse | Use of the B<password> variable is therefore strongly discouraged; it is left here |
| 335 | f1cbf1ac | Palo M | as a legacy option and for testing purposes. |
| 336 | |||
| 337 | 17f78427 | Lars Kruse | It should be always possible to use B<boincdir> variable instead - in such case |
| 338 | the file gui_rpc_auth.cfg is read by boinccmd binary directly. |
||
| 339 | If this plugin is used to fetch data from remote system, the gui_rpc_auth.cfg |
||
| 340 | can be copied to special directory in a secure way (e.g. via scp) and properly |
||
| 341 | f1cbf1ac | Palo M | protected by file permissions. |
| 342 | |||
| 343 | =head1 INTERPRETATION |
||
| 344 | |||
| 345 | 17f78427 | Lars Kruse | This plugin shows the estimated remaining computation time for all CPUs of |
| 346 | f1cbf1ac | Palo M | the machine and the estimated remaining computation time of longest workunit. |
| 347 | 17f78427 | Lars Kruse | The estimation is based on assumption that the workunits of different lengths |
| 348 | f1cbf1ac | Palo M | will be distributed to the CPUs evenly (which is not always the case). |
| 349 | |||
| 350 | 17f78427 | Lars Kruse | The warning level can be used to warn in advance about the risk of the local |
| 351 | f1cbf1ac | Palo M | workunit cache being depleted and the BOINC client running out of work. |
| 352 | 17f78427 | Lars Kruse | Although such warning can be achieved by configuring Munin master, there is |
| 353 | f1cbf1ac | Palo M | also this option to configure it on munin-node side. |
| 354 | |||
| 355 | =head1 EXAMPLES |
||
| 356 | |||
| 357 | =head2 Local BOINC Example |
||
| 358 | |||
| 359 | 17f78427 | Lars Kruse | BOINC is running on local machine. The BOINC binaries are installed in |
| 360 | f1cbf1ac | Palo M | F</opt/boinc/custom-6.10.1/>, the BOINC is running in directory |
| 361 | 17f78427 | Lars Kruse | F</usr/local/boinc/> under username boinc, group boinc and the password is used |
| 362 | f1cbf1ac | Palo M | to protect access to BOINC. |
| 363 | 17f78427 | Lars Kruse | Warning will be set when estimated work for any of CPUs will decrease under |
| 364 | f1cbf1ac | Palo M | 48 hours: |
| 365 | |||
| 366 | [boinc_*] |
||
| 367 | group boinc |
||
| 368 | env.boinccmd /opt/boinc/custom-6.10.1/boinccmd |
||
| 369 | env.boincdir /usr/local/boinc |
||
| 370 | env.estwk_warn 48 |
||
| 371 | |||
| 372 | =head2 Remote BOINC Example |
||
| 373 | |||
| 374 | 17f78427 | Lars Kruse | BOINC is running on 2 remote machines C<foo> and C<bar>. |
| 375 | On the local machine the binary of command-line interface is installed in |
||
| 376 | f1cbf1ac | Palo M | directory F</usr/local/bin/>. |
| 377 | 17f78427 | Lars Kruse | The BOINC password used on the remote machine C<foo> is stored in file |
| 378 | f1cbf1ac | Palo M | F</etc/munin/boinc/foo/gui_rpc_auth.cfg>. |
| 379 | 17f78427 | Lars Kruse | The BOINC password used on the remote machine C<bar> is stored in file |
| 380 | f1cbf1ac | Palo M | F</etc/munin/boinc/bar/gui_rpc_auth.cfg>. |
| 381 | 17f78427 | Lars Kruse | These files are owned and readable by root, readable by group munin and not |
| 382 | readable by others. |
||
| 383 | There are 2 symbolic links to this plugin created in the munin plugins |
||
| 384 | directory (usually F</etc/munin/plugins/>): F<snmp_foo_boincestwk> and |
||
| 385 | f1cbf1ac | Palo M | F<snmp_bar_boincestwk> |
| 386 | |||
| 387 | [snmp_foo_boinc*] |
||
| 388 | group munin |
||
| 389 | env.boinccmd /usr/local/bin/boinccmd |
||
| 390 | env.host foo |
||
| 391 | env.boincdir /etc/munin/boinc/foo |
||
| 392 | |||
| 393 | [snmp_bar_boinc*] |
||
| 394 | group munin |
||
| 395 | env.boinccmd /usr/local/bin/boinccmd |
||
| 396 | env.host bar |
||
| 397 | env.boincdir /etc/munin/boinc/bar |
||
| 398 | |||
| 399 | 17f78427 | Lars Kruse | This way the plugin can be used by Munin the same way as the Munin plugins |
| 400 | f1cbf1ac | Palo M | utilizing SNMP (although this plugin itself does not use SNMP). |
| 401 | |||
| 402 | =head1 BUGS |
||
| 403 | |||
| 404 | 17f78427 | Lars Kruse | The estimation is based on simple assumption, that longest workunits will be |
| 405 | processed first. This is the case when work is distributed evenly among CPUs. |
||
| 406 | But this is not always the case, because various deadlines for various |
||
| 407 | workunits may fire the "panic mode" of BOINC and scheduling could be much |
||
| 408 | different. |
||
| 409 | For example, there can be 4 CPUs, and BOINC having downloaded 4 workunits |
||
| 410 | with estimated run-time 1 hour each and 3 workunits with estimated run-time |
||
| 411 | 4 hours each. |
||
| 412 | f1cbf1ac | Palo M | This Munin plugin will report estimated work 4 hours for each CPU. |
| 413 | 17f78427 | Lars Kruse | But if deadline of those 1-hour workunits will be much shorter than deadline |
| 414 | of those 4-hours workunits, BOINC will schedule short workunits first (for all |
||
| 415 | f1cbf1ac | Palo M | 4 CPUs) and after finishing them it will schedule those long workunits. |
| 416 | This will result in real computation for 5 hours on 3 CPUs but only 1 hour on |
||
| 417 | 17f78427 | Lars Kruse | remaining 4th CPU. So after 1 hour of computation 1 of CPUs will run out of |
| 418 | f1cbf1ac | Palo M | work. |
| 419 | |||
| 420 | 17f78427 | Lars Kruse | There is no C<autoconf> capability at the moment. This is due to the fact, that |
| 421 | BOINC installations may vary over different systems, sometimes using default |
||
| 422 | directory from distribution (e.g. F</var/lib/boinc/> in Debian or Ubuntu), but |
||
| 423 | f1cbf1ac | Palo M | often running in user directories or in other separate directories. |
| 424 | 17f78427 | Lars Kruse | Also the user-ID under which BOINC runs often differs. |
| 425 | Under these circumstances the C<autoconf> would be either lame or too |
||
| 426 | f1cbf1ac | Palo M | complicated. |
| 427 | |||
| 428 | =head1 AUTHOR |
||
| 429 | |||
| 430 | Palo M. <palo.gm@gmail.com> |
||
| 431 | |||
| 432 | =head1 LICENSE |
||
| 433 | |||
| 434 | GPLv3 L<http://www.gnu.org/licenses/gpl-3.0.txt> |
||
| 435 | |||
| 436 | =cut |
||
| 437 | |||
| 438 | # vim:syntax=perl |
