Projet

Général

Profil

Paste
Télécharger au format
Statistiques
| Branche: | Révision:

root / plugins / boinc / boinc_estwk @ 17f78427

Historique | Voir | Annoter | Télécharger (13,2 ko)

1
#!/usr/bin/perl -w
2
#
3
# boinc_estwk - Munin plugin to monitor estimated time of BOINC WUs
4
#
5
# Run 'perldoc boinc_estwk' for full man page
6
#
7
# Author:  Palo M. <palo.gm@gmail.com>
8
# License: GPLv3 <http://www.gnu.org/licenses/gpl-3.0.txt>
9
#
10
#
11
# Parameters supported:
12
# 	config
13
#
14
#
15
# Configurable variables
16
#       boinccmd   - command-line control program (default: boinccmd)
17
# 	host       - Host to query (default: none)
18
#       port       - GUI RPC port (default: none = use BOINC-default)
19
#       boincdir   - Directory containing appropriate password file
20
#                    gui_rpc_auth.cfg (default: none)
21
#       estwk_warn - Warning level - minimum estimated work (default: 24.00 hours)
22
#       password   - Password for BOINC (default: none) !!! UNSAFE !!!
23
#
24
#
25
# $Log$
26
#
27
# Revision 1.0  2009/09/13  Palo M.
28
#   Add documentation and license information
29
#   Ready to publish on Munin Exchange
30
# Revision 0.9  2009/09/13  Palo M.
31
#   Add possibility to read password from file
32
# Revision 0.8  2009/09/12  Palo M.
33
#   Update default binary name: boinc_cmd -> boinccmd
34
# Revision 0.7  2008/08/30  Palo M.
35
#   Creation - Attempt to port functionality from C++ code
36
#
37
# (Revisions 0.1 - 0.6) were done in C++
38
#
39
#
40
#
41
# Magic markers:
42
#%# family=contrib
43

    
44
use strict;
45

    
46

    
47
#########################################################################
48
# 1. Parse configuration variables
49
#
50
# Look up one plugin setting in the environment.
# Returns the value of $ENV{$name} when that variable exists, otherwise
# the supplied fallback (undef when no fallback is given).
my $setting = sub {
  my ($name, $fallback) = @_;
  return $ENV{$name} if exists $ENV{$name};
  return $fallback;
};

my $BOINCCMD = $setting->('boinccmd', "boinccmd"); # command-line control program
my $HOST     = $setting->('host');                 # host to query (none by default)
my $PORT     = $setting->('port');                 # GUI RPC port (none by default)
my $PASSWORD = $setting->('password');             # BOINC password (none by default)
my $BOINCDIR = $setting->('boincdir');             # directory to chdir into (none by default)
my $ESTWKWRN = $setting->('estwk_warn', 24);       # warning level in hours
56

    
57
#########################################################################
58
# 2. Basic executable
59
#
60
# $BOINCCMD is later executed through the shell (backticks with a
# redirection), so every configured value appended to it is quoted as a
# single shell word.  Without this, a host name or password containing
# spaces or shell metacharacters would break or alter the command.
my $shell_quote = sub {
  my ($word) = @_;
  $word =~ s/'/'\\''/g;   # close quote, escaped quote, reopen quote
  return "'$word'";
};

if (defined $HOST) {
  # The port is only meaningful together with a host (host:port).
  my $target = defined $PORT ? "$HOST:$PORT" : $HOST;
  $BOINCCMD .= " --host " . $shell_quote->($target);
}
if (defined $PASSWORD) {
  $BOINCCMD .= " --passwd " . $shell_quote->($PASSWORD);
}
if (defined $BOINCDIR) {
  # boinccmd is run from this directory.  Report a failure on STDERR
  # instead of ignoring it silently; execution continues as before.
  chdir $BOINCDIR
    or warn "boinc_estwk: cannot chdir to '$BOINCDIR': $!\n";
}
72

    
73
#########################################################################
74
# 3. Get host info, to retrieve number of CPUs
75
#
76
# Number of CPUs, taken from the single "#CPUS:" line of the output of
# "boinccmd --get_host_info".
my $nCPUs;
my $hostInfo = `$BOINCCMD --get_host_info 2>/dev/null`;
if ($hostInfo eq "") {
  # No host info (e.g. client not running)
  exit -1;
}
{
  my @cpu_lines = grep { /^\s+#CPUS: / } split /\n/, $hostInfo;
  # Exactly one matching line is expected
  die "Unexpected output from boinccmd" if @cpu_lines != 1;
  ($nCPUs = $cpu_lines[0]) =~ s/^\s+#CPUS: //;
  no warnings;  # the value may be non-numeric; the comparison below rejects it
  die "Unexpected output from boinccmd" if $nCPUs < 1;
}
91

    
92
#print "$nCPUs\n";
93

    
94
#########################################################################
95
# 4. Display config if applicable
96
#
97
# When called with the single argument "config", print the graph
# definition (one AREA field for the longest workunit and one LINE2 field
# per CPU) and exit without fetching any values.
if ( defined $ARGV[0] and $ARGV[0] eq "config" ) {

  print "host_name $HOST\n" if defined $HOST;

  # Graph-wide settings
  print "graph_title BOINC work cache estimation\n",
        "graph_category htc\n",
        "graph_args --base 1000 -l 0 --alt-autoscale-max\n",
        "graph_vlabel Hours\n",
        "graph_scale no\n";

  # Longest WU is AREA, each CPU estimated is LINE2
  print "longest.label Longest WU\n",
        "longest.draw AREA\n",
        "longest.type GAUGE\n";

  for (my $cpu = 0; $cpu < $nCPUs; ++$cpu) {
    my $field = "cpu$cpu";
    print "$field.label CPU$cpu\n",
          "$field.draw LINE2\n",
          "$field.type GAUGE\n",
          "$field.warning ", sprintf("%.2f", $ESTWKWRN), ":\n",
          "$field.critical 0:\n";
  }

  exit 0;
}
123

    
124
#########################################################################
125
# 5. Fetch all needed data from BOINC-client with single call
126
#
127
# Text of the "Projects" and "Results" sections of the output of
# "boinccmd --get_simple_gui_info"; both stay empty strings when the
# command returns nothing or a section is missing.
my $prj_status = "";
my $results = "";

# Return the body of the section introduced by a "==== $title ====" header
# line: everything after that header up to the next header line (or the
# end of the text).  Returns "" when the section is absent or empty.
my $section_body = sub {
  my ($text, $title) = @_;
  my (undef, $after) = split /=+ \Q$title\E =+\n/, $text, 2;
  return "" unless defined $after;
  # [A-Za-z], not [A-z]: the latter also matches [ \ ] ^ _ and backtick.
  my ($body) = split /=+ [A-Za-z]+ =+\n/, $after;
  return defined $body ? $body : "";
};

my $simpleGuiInfo = `$BOINCCMD --get_simple_gui_info 2>/dev/null`;
# Backticks yield undef when the command cannot be started at all.
if (defined $simpleGuiInfo && $simpleGuiInfo ne "") {
  # Some data were retrieved, so let's split them
  $prj_status = $section_body->($simpleGuiInfo, "Projects");
  $results    = $section_body->($simpleGuiInfo, "Results");
}
143

    
144
#########################################################################
145
# 6. Parse BOINC data
146
#
147
# 6.a) Get suspended projects
148
# Split the "Projects" text into one chunk per project; each project
# starts with a separator line of the form "N) -----".
my @prjInfos = split /\d+\) -+\n/, (defined $prj_status ? $prj_status : "");
shift @prjInfos; # Throw out first empty line

my @susp_projects;    # master URLs of the projects suspended via GUI
for my $prj_info (@prjInfos) {
  my @lines = split /\n/, $prj_info;

  # Every project must carry exactly one "master URL" line ...
  my @prjURL = grep /^\s+master URL: /, @lines;
  if (@prjURL != 1) { die "Unexpected output from boinccmd"; }
  (my $prjURL = $prjURL[0]) =~ s/^\s+master URL: //;

  # ... and exactly one "suspended via GUI" line.
  my @suspGUI = grep /^\s+suspended via GUI: /, @lines;
  if (@suspGUI != 1) { die "Unexpected output from boinccmd"; }
  (my $suspGUI = $suspGUI[0]) =~ s/^\s+suspended via GUI: //;

  push @susp_projects, $prjURL if $suspGUI eq "yes";
}
# Nothing is printed here: STDOUT must carry only the "<field>.value"
# lines written at the end of the script.
167

    
168
# 6.b) Parse results, check their states
169
#      Get those which are NOT suspended by GUI
170
# Split the "Results" text into one chunk per result; each result starts
# with a separator line of the form "N) -----".
my @rsltInfos = split /\d+\) -+\n/, (defined $results ? $results : "");
shift @rsltInfos; # Throw out first empty line
my @rsltRemain;   # estimated remaining CPU time of every result in the work cache

# Suspended project URLs as hash keys, so that membership is an exact
# string comparison.  URLs contain regex metacharacters (".", "?", "+")
# and must not be interpolated into a pattern.
my %suspended_project = map { $_ => 1 } @susp_projects;

# Return the value of the first "   <name>: <value>" line among @$lines,
# or $default when there is no such line.
my $field = sub {
  my ($lines, $name, $default) = @_;
  for my $line (@$lines) {
    return $1 if $line =~ /^\s+\Q$name\E: (.*)$/;
  }
  return $default;
};

for my $rslt_info (@rsltInfos) {
  my @lines = split /\n/, $rslt_info;
  # A missing remaining-time line counts as 0 so it stays numeric.
  my $estRemain = $field->(\@lines, "estimated CPU time remaining", 0);
  my $schedstat = $field->(\@lines, "scheduler state", "");
  my $state     = $field->(\@lines, "state", "");
  my $acttask   = $field->(\@lines, "active_task_state", "");
  my $suspGUI   = $field->(\@lines, "suspended via GUI", "");
  my $prjURL    = $field->(\@lines, "project URL", "");

  # Result suspended itself, or belonging to a suspended project:
  # not in work cache - at the moment
  next if $suspGUI eq "yes";
  next if exists $suspended_project{$prjURL};

  # Only state 2 (RESULT_FILES_DOWNLOADED) can be in the work cache.
  # RESULT_FILES_DOWNLOADING, RESULT_COMPUTE_ERROR, RESULT_FILES_UPLOADING,
  # RESULT_FILES_UPLOADED and RESULT_ABORTED are not.
  next if $state ne "2";

  if ( ($schedstat eq "0") || ($schedstat eq "1") ) {
    # CPU_SCHED_UNINITIALIZED   0
    #  Not started yet: result is available in work cache
    # CPU_SCHED_PREEMPTED       1
    #  preempted: result is available in work cache
    push @rsltRemain, $estRemain;
    next;
  }

  if ($schedstat eq "2") {
    # CPU_SCHED_SCHEDULED       2
    if ( ($acttask eq "1") || ($acttask eq "0") || ($acttask eq "9") ) {
      # PROCESS_EXECUTING       1
      #  running
      # PROCESS_UNINITIALIZED   0
      # PROCESS_SUSPENDED       9
      #  suspended by "user active"/benchmark?
      # available in work cache
      push @rsltRemain, $estRemain;
    }
    # other active-task-state - maybe failing/aborted WU
    # => not in work cache
    next;
  }

  # There should be no other scheduler state
}
242

    
243
#########################################################################
244
# 7. Distribute remaining results per CPUs
245
#
246
# 7.a) Sort remaining results descending
247
# Remaining times, longest first
my @sortRemain = sort { $b <=> $a } @rsltRemain;

# 7.b) Assign to CPU with smallest workcache
# One accumulator per CPU, all starting empty.
my @CPUcache;
for (my $cpu = 0; $cpu < $nCPUs; ++$cpu) {
  push @CPUcache, 0;
}

foreach my $wu_time (@sortRemain) {
  # Ascending sort puts the least loaded CPU first; it receives this result.
  my ($least_loaded, @others) = sort { $a <=> $b } @CPUcache;
  @CPUcache = ($least_loaded + $wu_time, @others);
}

# At the end, sort CPUs descending
@CPUcache = sort { $b <=> $a } @CPUcache;
264

    
265
#########################################################################
266
# 8. Display output
267
#
268

    
269
# Convert from seconds to hours
270
# Print the values in hours (the remaining times are treated as seconds).
# With an empty work cache there is no longest workunit; report 0 rather
# than dividing an undefined value.
my $longest = defined $sortRemain[0] ? $sortRemain[0] : 0;
printf "longest.value %.2f\n", $longest / 3600;
for (my $i = 0; $i < $nCPUs; ++$i) {
  printf "cpu%d.value %.2f\n", $i, $CPUcache[$i] / 3600;
}

exit 0;
276

    
277

    
278
#########################################################################
279
# perldoc section
280

    
281
=head1 NAME
282

    
283
boinc_estwk - Munin plugin to monitor estimated time of BOINC WUs
284

    
285
=head1 APPLICABLE SYSTEMS
286

    
287
Linux machines running BOINC and munin-node
288

    
289
- or -
290

    
291
Linux servers (running munin-node) used to collect data from other systems
292
which are running BOINC, but not running munin-node (e.g. non-Linux systems)
293

    
294
=head1 CONFIGURATION
295

    
296
Following configuration variables are supported:
297

    
298
=over 12
299

    
300
=item B<boinccmd>
301

    
302
command-line control program (default: boinccmd)
303

    
304
=item B<host>
305

    
306
Host to query (default: none)
307

    
308
=item B<port>
309

    
310
GUI RPC port (default: none = use BOINC-default)
311

    
312
=item B<boincdir>
313

    
314
Directory containing appropriate file gui_rpc_auth.cfg (default: none)
315

    
316
=item B<estwk_warn>
317

    
318
Warning level - minimum estimated work (default: 24.00 hours)
319

    
320
=item B<password>
321

    
322
Password for BOINC (default: none)
323

    
324
=back
325

    
326
=head2 B<Security Consideration:>
327

    
328
Use of the variable B<password> poses a security risk. Even if the Munin
329
configuration file for this plugin containing BOINC-password is properly
330
protected, the password is exposed as environment variable and finally passed
331
to boinccmd as a parameter. It is therefore possible for local users of the
332
machine running this plugin to eavesdrop the BOINC password.
333

    
334
Use of the variable B<password> is therefore strongly discouraged and is left here
335
as a legacy option and for testing purposes.
336

    
337
It should be always possible to use B<boincdir> variable instead - in such case
338
the file gui_rpc_auth.cfg is read by boinccmd binary directly.
339
If this plugin is used to fetch data from remote system, the gui_rpc_auth.cfg
340
can be copied to special directory in a secure way (e.g. via scp) and properly
341
protected by file permissions.
342

    
343
=head1 INTERPRETATION
344

    
345
This plugin shows the estimated remaining computation time for all CPUs of
346
the machine and the estimated remaining computation time of longest workunit.
347
The estimation is based on assumption that the workunits of different lengths
348
will be distributed to the CPUs evenly (which is not always the case).
349

    
350
The warning level can be used to warn in advance about the risk of workunits
351
local cache depletion and BOINC client running out of the work.
352
Although such warning can be achieved by configuring Munin master, there is
353
also this option to configure it on munin-node side.
354

    
355
=head1 EXAMPLES
356

    
357
=head2 Local BOINC Example
358

    
359
BOINC is running on local machine. The BOINC binaries are installed in
360
F</opt/boinc/custom-6.10.1/>, the BOINC is running in directory
361
F</usr/local/boinc/> under username boinc, group boinc and the password is used
362
to protect access to BOINC.
363
Warning will be set when estimated work for any of CPUs will decrease under
364
48 hours:
365

    
366
  [boinc_*]
367
  group boinc
368
  env.boinccmd /opt/boinc/custom-6.10.1/boinccmd
369
  env.boincdir /usr/local/boinc
370
  env.estwk_warn 48
371

    
372
=head2 Remote BOINC Example
373

    
374
BOINC is running on 2 remote machines C<foo> and C<bar>.
375
On the local machine the binary of command-line interface is installed in
376
directory F</usr/local/bin/>.
377
The BOINC password used on the remote machine C<foo> is stored in file
378
F</etc/munin/boinc/foo/gui_rpc_auth.cfg>.
379
The BOINC password used on the remote machine C<bar> is stored in file
380
F</etc/munin/boinc/bar/gui_rpc_auth.cfg>.
381
These files are owned and readable by root, readable by group munin and not
382
readable by others.
383
There are 2 symbolic links to this plugin created in the munin plugins
384
directory (usually F</etc/munin/plugins/>): F<snmp_foo_boincestwk> and
385
F<snmp_bar_boincestwk>
386

    
387
  [snmp_foo_boinc*]
388
  group munin
389
  env.boinccmd /usr/local/bin/boinccmd
390
  env.host foo
391
  env.boincdir /etc/munin/boinc/foo
392

    
393
  [snmp_bar_boinc*]
394
  group munin
395
  env.boinccmd /usr/local/bin/boinccmd
396
  env.host bar
397
  env.boincdir /etc/munin/boinc/bar
398

    
399
This way the plugin can be used by Munin the same way as the Munin plugins
400
utilizing SNMP (although this plugin itself does not use SNMP).
401

    
402
=head1 BUGS
403

    
404
The estimation is based on simple assumption, that longest workunits will be
405
processed first. This is the case when work is distributed evenly among CPUs.
406
But this is not always the case, because various deadlines for various
407
workunits may fire the "panic mode" of BOINC and scheduling could be much
408
different.
409
For example, there can be 4 CPUs, and BOINC having downloaded 4 workunits
410
with estimated run-time 1 hour each and 3 workunits with estimated run-time
411
4 hours each.
412
This Munin plugin will report estimated work 4 hours for each CPU.
413
But if deadline of those 1-hour workunits will be much shorter than deadline
414
of those 4-hours workunits, BOINC will schedule short workunits first (for all
415
4 CPUs) and after finishing them it will schedule those long workunits.
416
This will result in real computation for 5 hours on 3 CPUs but only 1 hour on
417
remaining 4th CPU. So after 1 hour of computation 1 of CPUs will run out of
418
work.
419

    
420
There is no C<autoconf> capability at the moment. This is due to the fact, that
421
BOINC installations may vary over different systems, sometimes using default
422
directory from distribution (e.g. F</var/lib/boinc/> in Debian or Ubuntu), but
423
often running in user directories or in other separate directories.
424
Also the user-ID under which BOINC runs often differs.
425
Under these circumstances the C<autoconf> would be either lame or too
426
complicated.
427

    
428
=head1 AUTHOR
429

    
430
Palo M. <palo.gm@gmail.com>
431

    
432
=head1 LICENSE
433

    
434
GPLv3 L<http://www.gnu.org/licenses/gpl-3.0.txt>
435

    
436
=cut
437

    
438
# vim:syntax=perl