#!/usr/bin/env python3
#
# Munin plugin to monitor the request count, cache statuses, HTTP status codes and average request
# times of the specified nginx upstreams.
#
# Copyright Igor Borodikhin
#
# License: GPLv3
#
# Configuration parameters:
# env.graphs - which graphs to produce (optional, list of graphs separated by spaces, default -
#              cache http time request)
# env.log - log file path (mandatory, e.g.: /var/log/nginx/upstream.log)
# env.upstream - list of upstreams to monitor (mandatory, including port numbers, separated by
#                spaces, e.g.: 10.0.0.1:80 10.0.0.2:8080)
# env.statuses - list of HTTP status codes to monitor (optional, default - all statuses,
#                e.g.: 200 403 404 410 500 502)
# env.percentiles - which percentiles to draw on the time graphs (optional, list of percentiles
#                   separated by spaces, default - 80)
#
# ## Installation
# Copy the file to the directory /usr/share/munin/plugins/ and create a symbolic link for each
# log file you wish to monitor, for example as shown below.
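#
# For example, to monitor a site called "upstream" (the suffix after "nginx_upstream_multi_"
# becomes the site name, which labels the graphs and the state file), a hypothetical link
# would be:
#
#   ln -s /usr/share/munin/plugins/nginx_upstream_multi_ \
#         /etc/munin/plugins/nginx_upstream_multi_upstream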
#
# Specify the log_format in /etc/nginx/conf.d/upstream.conf:
# log_format upstream "ua=[$upstream_addr] ut=[$upstream_response_time] us=[$upstream_status] \
#                      cs=[$upstream_cache_status]";
#
# Use it in your site configuration (/etc/nginx/sites-enabled/anything.conf):
# access_log /var/log/nginx/upstream.log upstream;
#
# Attention! Since the default user (nobody) does not have read permission for the nginx log
# files, the plugin needs to run as root.
#
# Then specify the options in /etc/munin/plugin-conf.d/munin-node:
#
# [nginx_upstream_multi_upstream]
# user root
# env.graphs cache http time request
# env.log /var/log/nginx/upstream.log
# env.upstream 10.0.0.1:80 10.0.0.2:8080 unix:/tmp/upstream3
# env.statuses 200 403 404 410 500 502
# env.percentiles 50 80
#
# #%# family=contrib

import copy
import math
import os
import re
import sys
import time


# How we've been called
progName = sys.argv[0]
progName = progName[progName.rfind("/") + 1:]


# Where to store plugin state
stateDir = os.environ.get("MUNIN_PLUGSTATE", None)

# Which site configuration we should use
siteName = progName[len("nginx_upstream_multi_"):]

# Log path
logPath = os.environ.get("log", "/var/log/nginx/access.log")

# HTTP statuses list
httpStatusString = (
    "100:Continue;101:Switching protocols;102:Processing;200:OK;201:Created;202:Accepted;"
    "203:Non-Authoritative Information;204:No content;205:Reset content;206:Partial content;"
    "207:Multi-status;226:IM used;300:Multiple choices;301:Moved permanently;"
    "302:Moved temporarily;303:See other;304:Not modified;305:Use proxy;307:Temporary redirect;"
    "400:Bad request;401:Unauthorized;402:Payment required;403:Forbidden;404:Not found;"
    "405:Method not allowed;406:Not acceptable;407:Proxy Authentication Required;"
    "408:Request timeout;409:Conflict;410:Gone;411:Length required;412:Precondition failed;"
    "413:Request entity too large;414:Request URI too large;415:Unsupported media type;"
    "416:Request range not satisfiable;417:Expectation failed;422:Unprocessable entity;"
    "423:Locked;424:Failed dependency;425:Unordered collection;426:Upgrade required;"
    "449:Retry with;456:Unrecoverable error;500:Internal server error;501:Not implemented;"
    "502:Bad gateway;503:Service unavailable;504:Gateway timeout;505:HTTP version not supported;"
    "506:Variant also negotiates;507:Insufficient storage;508:Loop detected;"
    "509:Bandwidth limit exceeded;510:Not extended")

# An empty list of wanted statuses is interpreted as: all statuses
statuses = os.environ.get("statuses", "").split()

httpStatusList = {}
for statusString in httpStatusString.split(";"):
    [code, title] = statusString.split(":")
    if len(statuses) == 0 or code in statuses:
        httpStatusList[code] = {
            "title": title,
            "requests": 0
        }

cacheStatusList = {"MISS": 0, "BYPASS": 0, "EXPIRED": 0, "UPDATING": 0, "STALE": 0, "HIT": 0}
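# Note: newer nginx versions can also report $upstream_cache_status values not listed here
# (e.g. REVALIDATED); the parsing loop below simply skips statuses it does not know.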

# Parse upstreams
upstreams = {}
if "upstream" in os.environ:
    upstreamString = os.environ["upstream"]
    upstreamList = upstreamString.split()
    for upstream in upstreamList:
        upstreams[upstream] = {
            "requests": 0,
            "time": 0,
            "times": [],
            "cache": copy.deepcopy(cacheStatusList),
            "http": copy.deepcopy(httpStatusList)
        }
else:
    raise Exception("No upstreams specified")

percentiles = os.environ.get("percentiles", "80").split()

graphs_enabled = os.environ.get("graphs", "cache http time request").split()

now = int(time.time())
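
# The plugin reads the log incrementally: the byte offset reached on the previous run is kept in
# a state file, and that file's modification time doubles as the timestamp of the previous run
# (used below to turn the counters into per-second rates).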
lastBytePath = os.path.join(stateDir, "nginx_upstream_multi_{}_lastByte.txt".format(siteName))
try:
    lastRun = os.path.getmtime(lastBytePath)
except OSError:
    lastRun = now


def sanitize(string):
    return string.replace(".", "_").replace(":", "_").replace("/", "_").replace("-", "_")
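

# For example, sanitize("10.0.0.1:80") returns "10_0_0_1_80", which is safe to use inside a
# munin field name.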

if len(sys.argv) == 2 and sys.argv[1] == "config":
    # Parent graph declaration; the multigraph name must be sanitized the same way as in the
    # child graph names below, otherwise the subgraphs would not nest under it
    print("multigraph nginx_upstream_multi_%s" % sanitize(siteName))
    print("graph_title Requests number")
    print("graph_vlabel rps")
    print("graph_category webserver")
    for upstream in upstreams.keys():
        print("us%s_requests.label %s" % (sanitize(upstream), upstream))
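
    # With the hypothetical "upstream" site and a single upstream 10.0.0.1:80, the block above
    # would print:
    #
    #   multigraph nginx_upstream_multi_upstream
    #   graph_title Requests number
    #   graph_vlabel rps
    #   graph_category webserver
    #   us10_0_0_1_80_requests.label 10.0.0.1:80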

    # Requests graph declaration
    if "request" in graphs_enabled:
        for upstream in upstreams.keys():
            print()
            print("multigraph nginx_upstream_multi_%s.%s_requests"
                  % (sanitize(siteName), sanitize(upstream)))
            print("graph_title Requests number - %s" % upstream)
            print("graph_vlabel rps")
            print("graph_category webserver")
            print("us%s_requests.label %s" % (sanitize(upstream), upstream))
        print()

    # Times graph declaration
    if "time" in graphs_enabled:
        for upstream in upstreams.keys():
            print()
            print("multigraph nginx_upstream_multi_%s.%s_times"
                  % (sanitize(siteName), sanitize(upstream)))
            print("graph_title Request time - %s" % upstream)
            print("graph_vlabel sec.")
            print("graph_category webserver")
            print("us%s_times.label average" % (sanitize(upstream)))
            for percentile in percentiles:
                print("us%s_times_percentile_%s.label %s-percentile"
                      % (sanitize(upstream), percentile, percentile))
        print()

    # HTTP status codes graph declaration
    if "http" in graphs_enabled:
        for upstream in upstreams.keys():
            print()
            print("multigraph nginx_upstream_multi_%s.%s_statuses"
                  % (sanitize(siteName), sanitize(upstream)))
            print("graph_title HTTP - %s" % upstream)
            print("graph_vlabel rps")
            print("graph_category webserver")
            for status in sorted(httpStatusList.keys()):
                print("http%s_%s_status.label %s - %s"
                      % (status, sanitize(upstream), status, httpStatusList[status]["title"]))
        print()

    # Cache status graph declaration
    if "cache" in graphs_enabled:
        for upstream in upstreams.keys():
            print()
            print("multigraph nginx_upstream_multi_%s.%s_cache"
                  % (sanitize(siteName), sanitize(upstream)))
            print("graph_title Cache - %s" % upstream)
            print("graph_vlabel rps")
            print("graph_category webserver")
            for status in cacheStatusList:
                print("us%s_%s_cache.label %s" % (sanitize(status), sanitize(upstream), status))
        print()
else:
    timeElapsed = now - lastRun

    lastByteHandle = None

    try:
        lastByteHandle = open(lastBytePath, "r")
        lastByte = int(lastByteHandle.read())
    except Exception:
        lastByte = 0

    if lastByteHandle is not None:
        lastByteHandle.close()

    try:
        logHandle = open(logPath, "r")
    except OSError as e:
        print("Log file %s not readable: %s" % (logPath, e.strerror), file=sys.stderr)
        sys.exit(1)

    try:
        logSize = os.path.getsize(logPath)
    except OSError:
        logSize = 0

    # If the log was rotated, start over from the beginning
    if logSize < lastByte:
        lastByte = 0

    regExp = re.compile(r"ua=\[(.*?)\]\s+ut=\[(.*?)\]\s+us=\[(.*?)\]\s+cs=\[(.*?)\]")

    logHandle.seek(lastByte)
    for line in logHandle:
        match = regExp.search(line)
        if match:
            # Extract data
            address = match.group(1)
            request_time = match.group(2)
            status = match.group(3)
            cache = match.group(4)

            # Normalize the separators ("," and " : ") to single spaces
            address = re.sub(r"\s+", " ", address.replace(",", " ").replace(" : ", " "))
            request_time = re.sub(r"\s+", " ", request_time.replace(",", " ").replace(" : ", " "))
            status = re.sub(r"\s+", " ", status.replace(",", " ").replace(" : ", " "))
            cache = re.sub(r"\s+", " ", cache.replace(",", " ").replace(" : ", " "))

            addresses = address.split()
            times = request_time.split()
            uStatuses = status.split()  # renamed so it does not shadow the global "statuses"
            caches = cache.split()

            index = 0
            for uAddress in addresses:
                if uAddress in upstreams.keys():
                    try:
                        uTime = float(times[index])
                    except (ValueError, IndexError):
                        uTime = 0

                    if index < len(uStatuses):
                        uStatus = uStatuses[index]
                    else:
                        uStatus = "-"

                    if index < len(caches):
                        uCache = caches[index]
                    else:
                        uCache = "-"

                    # uAddress is a configured upstream here, so every hit counts as a request;
                    # uTime is always a float (failed conversions became 0 above)
                    upstreams[uAddress]["requests"] += 1
                    upstreams[uAddress]["time"] += uTime
                    upstreams[uAddress]["times"].append(uTime)
                    if uStatus != "-" and uStatus in upstreams[uAddress]["http"].keys():
                        upstreams[uAddress]["http"][uStatus]["requests"] += 1
                    if uCache != "-" and uCache in upstreams[uAddress]["cache"]:
                        upstreams[uAddress]["cache"][uCache] += 1
                index += 1

    try:
        lastByteHandle = open(lastBytePath, "w")
        lastByteHandle.write(str(logHandle.tell()))
        lastByteHandle.close()
    except OSError as e:
        print("Failed to write state file (%s): %s" % (lastBytePath, e.strerror), file=sys.stderr)
        sys.exit(1)

    logHandle.close()
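
    # The counters collected above cover the interval since the previous run; dividing by
    # timeElapsed converts them into the per-second rates munin expects.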

    # Parent graph data (the multigraph line must precede the values, mirroring "config")
    print("multigraph nginx_upstream_multi_%s" % sanitize(siteName))
    for upstream in upstreams.keys():
        value = 0
        if timeElapsed > 0:
            value = upstreams[upstream]["requests"] / timeElapsed

        print("us%s_requests.value %s" % (sanitize(upstream), value))

    # Requests graph data
    if "request" in graphs_enabled:
        for upstream in upstreams.keys():
            print()
            print("multigraph nginx_upstream_multi_%s.%s_requests"
                  % (sanitize(siteName), sanitize(upstream)))
            value = 0
            if timeElapsed > 0:
                value = upstreams[upstream]["requests"] / timeElapsed
            print("us%s_requests.value %s" % (sanitize(upstream), value))
        print()

    # Times graph data
    if "time" in graphs_enabled:
        for upstream in upstreams.keys():
            uTime = 0
            if upstreams[upstream]["requests"] > 0:
                uTime = upstreams[upstream]["time"] / upstreams[upstream]["requests"]
                upstreams[upstream]["times"].sort()
            print()
            print("multigraph nginx_upstream_multi_%s.%s_times"
                  % (sanitize(siteName), sanitize(upstream)))
            print("us%s_times.value %s" % (sanitize(upstream), uTime))
            for percentile in percentiles:
                percentileValue = 0
                times = upstreams[upstream]["times"]
                if len(times) > 0:
                    # Index of the requested percentile in the sorted time list, clamped to the
                    # list bounds; odd-length lists average the two neighbouring samples
                    percentileKey = int(percentile) * len(times) / 100
                    if len(times) % 2 > 0:
                        low = min(int(math.floor(percentileKey)), len(times) - 1)
                        high = min(int(math.ceil(percentileKey)), len(times) - 1)
                        percentileValue = (times[low] + times[high]) / 2
                    else:
                        percentileValue = times[min(int(percentileKey), len(times) - 1)]
                print("us%s_times_percentile_%s.value %s"
                      % (sanitize(upstream), percentile, percentileValue))
        print()

    # HTTP status codes graph data
    if "http" in graphs_enabled:
        for upstream in upstreams.keys():
            print()
            print("multigraph nginx_upstream_multi_%s.%s_statuses"
                  % (sanitize(siteName), sanitize(upstream)))
            for status in sorted(httpStatusList.keys()):
                value = 0
                if timeElapsed > 0:
                    value = upstreams[upstream]["http"][status]["requests"] / timeElapsed

                print("http%s_%s_status.value %s" % (status, sanitize(upstream), value))
        print()

    # Cache status graph data
    if "cache" in graphs_enabled:
        for upstream in upstreams.keys():
            print()
            print("multigraph nginx_upstream_multi_%s.%s_cache"
                  % (sanitize(siteName), sanitize(upstream)))
            for status in cacheStatusList:
                value = 0
                if timeElapsed > 0:
                    value = upstreams[upstream]["cache"][status] / timeElapsed

                print("us%s_%s_cache.value %s" % (sanitize(status), sanitize(upstream), value))
        print()