Performance improvements for restic-exporter
This commit is contained in:
parent
880b47131c
commit
60617651cb
3 changed files with 221 additions and 98 deletions
|
@ -1,5 +1,13 @@
|
||||||
# Changelog
|
# Changelog
|
||||||
|
|
||||||
|
## next_release
|
||||||
|
|
||||||
|
* Added EXIT_ON_ERROR env var - it is now possible to control the exit-on-error
|
||||||
|
behaviour
|
||||||
|
* Added NO_CHECK env var - it is now possible to skip the restic check operation
|
||||||
|
* Added NO_STATS env var - it is now possible to skip collecting per-backup stats
|
||||||
|
* Added backup tag to the metric labels (if tags are present)
|
||||||
|
|
||||||
## 1.1.0 (2023/02/02)
|
## 1.1.0 (2023/02/02)
|
||||||
|
|
||||||
* Update Restic 0.15.1
|
* Update Restic 0.15.1
|
||||||
|
|
44
README.md
44
README.md
|
@ -97,38 +97,56 @@ All configuration is done with environment variables:
|
||||||
* Amazon S3: `s3:s3.amazonaws.com/bucket_name`
|
* Amazon S3: `s3:s3.amazonaws.com/bucket_name`
|
||||||
* Backblaze B2: `b2:bucketname:path/to/repo`
|
* Backblaze B2: `b2:bucketname:path/to/repo`
|
||||||
|
|
||||||
- `RESTIC_REPO_PASSWORD`: Restic repository password in plain text. This is only required if `RESTIC_REPO_PASSWORD_FILE` is not defined.
|
- `RESTIC_REPO_PASSWORD`: Restic repository password in plain text. This is only
|
||||||
- `RESTIC_REPO_PASSWORD_FILE`: File with the Restic repository password in plain text. This is only required if `RESTIC_REPO_PASSWORD` is not defined. Remember to mount the Docker volume with the file.
|
required if `RESTIC_REPO_PASSWORD_FILE` is not defined.
|
||||||
- `AWS_ACCESS_KEY_ID`: (Optional) Required for Amazon S3, Minio and Wasabi backends.
|
- `RESTIC_REPO_PASSWORD_FILE`: File with the Restic repository password in plain
|
||||||
- `AWS_SECRET_ACCESS_KEY`: (Optional) Required for Amazon S3, Minio and Wasabi backends.
|
text. This is only required if `RESTIC_REPO_PASSWORD` is not defined. Remember
|
||||||
|
to mount the Docker volume with the file.
|
||||||
|
- `AWS_ACCESS_KEY_ID`: (Optional) Required for Amazon S3, Minio and Wasabi
|
||||||
|
backends.
|
||||||
|
- `AWS_SECRET_ACCESS_KEY`: (Optional) Required for Amazon S3, Minio and Wasabi
|
||||||
|
backends.
|
||||||
- `B2_ACCOUNT_ID`: (Optional) Required for Backblaze B2 backend.
|
- `B2_ACCOUNT_ID`: (Optional) Required for Backblaze B2 backend.
|
||||||
- `B2_ACCOUNT_KEY`: (Optional) Required for Backblaze B2 backend.
|
- `B2_ACCOUNT_KEY`: (Optional) Required for Backblaze B2 backend.
|
||||||
- `REFRESH_INTERVAL`: (Optional) Refresh interval for the metrics in seconds. Computing the metrics is a expensive task, keep this value as high as possible. Default 60
|
- `REFRESH_INTERVAL`: (Optional) Refresh interval for the metrics in seconds.
|
||||||
- `LISTEN_PORT`: (Optional) The address the exporter should listen on. The default is `8001`.
|
Computing the metrics is an expensive task, keep this value as high as possible.
|
||||||
- `LISTEN_ADDRESS`: (Optional) The address the exporter should listen on. The default is to listen on all addresses.
|
Default is `60` seconds.
|
||||||
|
- `LISTEN_PORT`: (Optional) The port the exporter should listen on. The
|
||||||
|
default is `8001`.
|
||||||
|
- `LISTEN_ADDRESS`: (Optional) The address the exporter should listen on. The
|
||||||
|
default is to listen on all addresses.
|
||||||
- `LOG_LEVEL`: (Optional) Log level of the traces. The default is `INFO`.
|
- `LOG_LEVEL`: (Optional) Log level of the traces. The default is `INFO`.
|
||||||
|
- `EXIT_ON_ERROR`: (Optional) Shutdown exporter on any `restic` error. Default
|
||||||
|
is `False` (only log error, such as network error with Cloud backends).
|
||||||
|
- `NO_CHECK`: (Optional) Do not perform `restic check` operation for performance
|
||||||
|
reasons. Default is `False` (perform `restic check`).
|
||||||
|
- `NO_STATS`: (Optional) Do not collect per backup statistics for performance
|
||||||
|
reasons. Default is `False` (collect per backup statistics).
|
||||||
|
|
||||||
## Exported metrics
|
## Exported metrics
|
||||||
|
|
||||||
```shell
|
```python
|
||||||
# HELP restic_check_success Result of restic check operation in the repository
|
# HELP restic_check_success Result of restic check operation in the repository
|
||||||
# TYPE restic_check_success gauge
|
# TYPE restic_check_success gauge
|
||||||
restic_check_success 1.0
|
restic_check_success 1.0
|
||||||
# HELP restic_snapshots_total Total number of snapshots in the repository
|
# HELP restic_snapshots_total Total number of snapshots in the repository
|
||||||
# TYPE restic_snapshots_total counter
|
# TYPE restic_snapshots_total counter
|
||||||
restic_snapshots_total 1777.0
|
restic_snapshots_total 100.0
|
||||||
# HELP restic_backup_timestamp Timestamp of the last backup
|
# HELP restic_backup_timestamp Timestamp of the last backup
|
||||||
# TYPE restic_backup_timestamp gauge
|
# TYPE restic_backup_timestamp gauge
|
||||||
restic_backup_timestamp{client_hostname="PC-HOME-1",client_username="PC-HOME-1\\User-1",snapshot_hash="1911eb846f1642c327936915f1fad4e16190d0ab6b68e045294f5f0280a00ebe"} 1.669754009e+09
|
restic_backup_timestamp{client_hostname="product.example.com",client_username="root",snapshot_hash="20795072cba0953bcdbe52e9cf9d75e5726042f5bbf2584bb2999372398ee835",snapshot_tag="mysql"} 1.666273638e+09
|
||||||
# HELP restic_backup_files_total Number of files in the backup
|
# HELP restic_backup_files_total Number of files in the backup
|
||||||
# TYPE restic_backup_files_total counter
|
# TYPE restic_backup_files_total counter
|
||||||
restic_backup_files_total{client_hostname="PC-HOME-1",client_username="PC-HOME-1\\User-1",snapshot_hash="1911eb846f1642c327936915f1fad4e16190d0ab6b68e045294f5f0280a00ebe"} 19051.0
|
restic_backup_files_total{client_hostname="product.example.com",client_username="root",snapshot_hash="20795072cba0953bcdbe52e9cf9d75e5726042f5bbf2584bb2999372398ee835",snapshot_tag="mysql"} 8.0
|
||||||
# HELP restic_backup_size_total Total size of backup in bytes
|
# HELP restic_backup_size_total Total size of backup in bytes
|
||||||
# TYPE restic_backup_size_total counter
|
# TYPE restic_backup_size_total counter
|
||||||
restic_backup_size_total{client_hostname="PC-HOME-1",client_username="PC-HOME-1\\User-1",snapshot_hash="1911eb846f1642c327936915f1fad4e16190d0ab6b68e045294f5f0280a00ebe"} 4.1174838248e+010
|
restic_backup_size_total{client_hostname="product.example.com",client_username="root",snapshot_hash="20795072cba0953bcdbe52e9cf9d75e5726042f5bbf2584bb2999372398ee835",snapshot_tag="mysql"} 4.3309562e+07
|
||||||
# HELP restic_backup_snapshots_total Total number of snapshots
|
# HELP restic_backup_snapshots_total Total number of snapshots
|
||||||
# TYPE restic_backup_snapshots_total counter
|
# TYPE restic_backup_snapshots_total counter
|
||||||
restic_backup_snapshots_total{client_hostname="PC-HOME-1",client_username="PC-HOME-1\\User-1",snapshot_hash="1911eb846f1642c327936915f1fad4e16190d0ab6b68e045294f5f0280a00ebe"} 106.0
|
restic_backup_snapshots_total{client_hostname="product.example.com",client_username="root",snapshot_hash="20795072cba0953bcdbe52e9cf9d75e5726042f5bbf2584bb2999372398ee835",snapshot_tag="mysql"} 1.0
|
||||||
|
# HELP restic_scrape_duration_seconds Ammount of time each scrape takes
|
||||||
|
# TYPE restic_scrape_duration_seconds gauge
|
||||||
|
restic_scrape_duration_seconds 166.9411084651947
|
||||||
```
|
```
|
||||||
|
|
||||||
## Prometheus config
|
## Prometheus config
|
||||||
|
|
|
@ -10,20 +10,27 @@ import subprocess
|
||||||
import sys
|
import sys
|
||||||
import traceback
|
import traceback
|
||||||
|
|
||||||
import prometheus_client
|
from prometheus_client import start_http_server
|
||||||
import prometheus_client.core
|
from prometheus_client.core import GaugeMetricFamily, CounterMetricFamily, REGISTRY
|
||||||
|
|
||||||
|
|
||||||
class ResticCollector(object):
|
class ResticCollector(object):
|
||||||
def __init__(self, repository, password_file):
|
def __init__(
|
||||||
|
self, repository, password_file, exit_on_error, disable_check, disable_stats
|
||||||
|
):
|
||||||
self.repository = repository
|
self.repository = repository
|
||||||
self.password_file = password_file
|
self.password_file = password_file
|
||||||
|
self.exit_on_error = exit_on_error
|
||||||
|
self.disable_check = disable_check
|
||||||
|
self.disable_stats = disable_stats
|
||||||
# todo: the stats cache increases over time -> remove old ids
|
# todo: the stats cache increases over time -> remove old ids
|
||||||
# todo: cold start -> the stats cache could be saved in a persistent volume
|
# todo: cold start -> the stats cache could be saved in a persistent
|
||||||
# todo: cold start -> the restic cache (/root/.cache/restic) could be saved in a persistent volume
|
# volume
|
||||||
|
# todo: cold start -> the restic cache (/root/.cache/restic) could be
|
||||||
|
# saved in a persistent volume
|
||||||
self.stats_cache = {}
|
self.stats_cache = {}
|
||||||
self.metrics = {}
|
self.metrics = {}
|
||||||
self.refresh(True)
|
self.refresh(exit_on_error)
|
||||||
|
|
||||||
def collect(self):
|
def collect(self):
|
||||||
logging.debug("Incoming request")
|
logging.debug("Incoming request")
|
||||||
|
@ -31,52 +38,71 @@ class ResticCollector(object):
|
||||||
common_label_names = [
|
common_label_names = [
|
||||||
"client_hostname",
|
"client_hostname",
|
||||||
"client_username",
|
"client_username",
|
||||||
"snapshot_hash"
|
"snapshot_hash",
|
||||||
|
"snapshot_tag",
|
||||||
]
|
]
|
||||||
|
|
||||||
check_success = prometheus_client.core.GaugeMetricFamily(
|
check_success = GaugeMetricFamily(
|
||||||
"restic_check_success",
|
"restic_check_success",
|
||||||
"Result of restic check operation in the repository",
|
"Result of restic check operation in the repository",
|
||||||
labels=[])
|
labels=[],
|
||||||
|
)
|
||||||
|
|
||||||
snapshots_total = prometheus_client.core.CounterMetricFamily(
|
snapshots_total = CounterMetricFamily(
|
||||||
"restic_snapshots_total",
|
"restic_snapshots_total",
|
||||||
"Total number of snapshots in the repository",
|
"Total number of snapshots in the repository",
|
||||||
labels=[])
|
labels=[],
|
||||||
|
)
|
||||||
|
|
||||||
backup_timestamp = prometheus_client.core.GaugeMetricFamily(
|
backup_timestamp = GaugeMetricFamily(
|
||||||
"restic_backup_timestamp",
|
"restic_backup_timestamp",
|
||||||
"Timestamp of the last backup",
|
"Timestamp of the last backup",
|
||||||
labels=common_label_names)
|
labels=common_label_names,
|
||||||
|
)
|
||||||
|
|
||||||
backup_files_total = prometheus_client.core.CounterMetricFamily(
|
backup_files_total = CounterMetricFamily(
|
||||||
"restic_backup_files_total",
|
"restic_backup_files_total",
|
||||||
"Number of files in the backup",
|
"Number of files in the backup",
|
||||||
labels=common_label_names)
|
labels=common_label_names,
|
||||||
|
)
|
||||||
|
|
||||||
backup_size_total = prometheus_client.core.CounterMetricFamily(
|
backup_size_total = CounterMetricFamily(
|
||||||
"restic_backup_size_total",
|
"restic_backup_size_total",
|
||||||
"Total size of backup in bytes",
|
"Total size of backup in bytes",
|
||||||
labels=common_label_names)
|
labels=common_label_names,
|
||||||
|
)
|
||||||
|
|
||||||
backup_snapshots_total = prometheus_client.core.CounterMetricFamily(
|
backup_snapshots_total = CounterMetricFamily(
|
||||||
"restic_backup_snapshots_total",
|
"restic_backup_snapshots_total",
|
||||||
"Total number of snapshots",
|
"Total number of snapshots",
|
||||||
labels=common_label_names)
|
labels=common_label_names,
|
||||||
|
)
|
||||||
|
|
||||||
|
scrape_duration_seconds = GaugeMetricFamily(
|
||||||
|
"restic_scrape_duration_seconds",
|
||||||
|
"Ammount of time each scrape takes",
|
||||||
|
labels=[],
|
||||||
|
)
|
||||||
|
|
||||||
check_success.add_metric([], self.metrics["check_success"])
|
check_success.add_metric([], self.metrics["check_success"])
|
||||||
snapshots_total.add_metric([], self.metrics["snapshots_total"])
|
snapshots_total.add_metric([], self.metrics["snapshots_total"])
|
||||||
|
|
||||||
for client in self.metrics['clients']:
|
for client in self.metrics["clients"]:
|
||||||
common_label_values = [
|
common_label_values = [
|
||||||
client["hostname"],
|
client["hostname"],
|
||||||
client["username"],
|
client["username"],
|
||||||
client["snapshot_hash"]
|
client["snapshot_hash"],
|
||||||
|
client["snapshot_tag"],
|
||||||
]
|
]
|
||||||
|
|
||||||
backup_timestamp.add_metric(common_label_values, client["timestamp"])
|
backup_timestamp.add_metric(common_label_values, client["timestamp"])
|
||||||
backup_files_total.add_metric(common_label_values, client["files_total"])
|
backup_files_total.add_metric(common_label_values, client["files_total"])
|
||||||
backup_size_total.add_metric(common_label_values, client["size_total"])
|
backup_size_total.add_metric(common_label_values, client["size_total"])
|
||||||
backup_snapshots_total.add_metric(common_label_values, client["snapshots_total"])
|
backup_snapshots_total.add_metric(
|
||||||
|
common_label_values, client["snapshots_total"]
|
||||||
|
)
|
||||||
|
|
||||||
|
scrape_duration_seconds.add_metric([], self.metrics["duration"])
|
||||||
|
|
||||||
yield check_success
|
yield check_success
|
||||||
yield snapshots_total
|
yield snapshots_total
|
||||||
|
@ -84,75 +110,117 @@ class ResticCollector(object):
|
||||||
yield backup_files_total
|
yield backup_files_total
|
||||||
yield backup_size_total
|
yield backup_size_total
|
||||||
yield backup_snapshots_total
|
yield backup_snapshots_total
|
||||||
|
yield scrape_duration_seconds
|
||||||
|
|
||||||
def refresh(self, exit_on_error=False):
|
def refresh(self, exit_on_error=False):
|
||||||
try:
|
try:
|
||||||
self.metrics = self.get_metrics()
|
self.metrics = self.get_metrics()
|
||||||
except Exception:
|
except Exception:
|
||||||
logging.error("Unable to collect metrics from Restic. %s", traceback.format_exc(0).replace("\n", " "))
|
logging.error(
|
||||||
|
"Unable to collect metrics from Restic. %s",
|
||||||
|
traceback.format_exc(0).replace("\n", " "),
|
||||||
|
)
|
||||||
|
|
||||||
|
# Shutdown exporter for any error
|
||||||
if exit_on_error:
|
if exit_on_error:
|
||||||
sys.exit(1)
|
sys.exit(1)
|
||||||
|
|
||||||
def get_metrics(self):
|
def get_metrics(self):
|
||||||
|
duration = time.time()
|
||||||
all_snapshots = self.get_snapshots()
|
all_snapshots = self.get_snapshots()
|
||||||
latest_snapshots = self.get_snapshots(True)
|
latest_snapshots = self.get_snapshots(True)
|
||||||
clients = []
|
clients = []
|
||||||
for snap in latest_snapshots:
|
for snap in latest_snapshots:
|
||||||
stats = self.get_stats(snap['id'])
|
# Collect stats for each snap only if enabled
|
||||||
|
if self.disable_stats:
|
||||||
|
# return -1 as "no-stats" value
|
||||||
|
stats = {
|
||||||
|
"total_size": -1,
|
||||||
|
"total_file_count": -1,
|
||||||
|
}
|
||||||
|
else:
|
||||||
|
stats = self.get_stats(snap["id"])
|
||||||
|
|
||||||
time_parsed = re.sub(r'\.[^+-]+', '', snap['time'])
|
# use first element of tags if tags is present
|
||||||
|
if "tags" in snap:
|
||||||
|
tag = snap["tags"][0]
|
||||||
|
else:
|
||||||
|
tag = ""
|
||||||
|
|
||||||
|
time_parsed = re.sub(r"\.[^+-]+", "", snap["time"])
|
||||||
if len(time_parsed) > 19:
|
if len(time_parsed) > 19:
|
||||||
# restic 14: '2023-01-12T06:59:33.1576588+01:00' -> '2023-01-12T06:59:33+01:00'
|
# restic 14: '2023-01-12T06:59:33.1576588+01:00' ->
|
||||||
|
# '2023-01-12T06:59:33+01:00'
|
||||||
time_format = "%Y-%m-%dT%H:%M:%S%z"
|
time_format = "%Y-%m-%dT%H:%M:%S%z"
|
||||||
else:
|
else:
|
||||||
# restic 12: '2023-02-01T14:14:19.30760523Z' -> '2023-02-01T14:14:19'
|
# restic 12: '2023-02-01T14:14:19.30760523Z' ->
|
||||||
|
# '2023-02-01T14:14:19'
|
||||||
time_format = "%Y-%m-%dT%H:%M:%S"
|
time_format = "%Y-%m-%dT%H:%M:%S"
|
||||||
timestamp = time.mktime(datetime.datetime.strptime(time_parsed, time_format).timetuple())
|
timestamp = time.mktime(
|
||||||
|
datetime.datetime.strptime(time_parsed, time_format).timetuple()
|
||||||
|
)
|
||||||
|
|
||||||
snapshots_total = 0
|
snapshots_total = 0
|
||||||
for snap2 in all_snapshots:
|
for snap2 in all_snapshots:
|
||||||
if snap2['hash'] == snap['hash']:
|
if snap2["hash"] == snap["hash"]:
|
||||||
snapshots_total += 1
|
snapshots_total += 1
|
||||||
|
|
||||||
clients.append({
|
clients.append(
|
||||||
'snapshot_hash': snap['hash'],
|
{
|
||||||
'hostname': snap['hostname'],
|
"hostname": snap["hostname"],
|
||||||
'username': snap['username'],
|
"username": snap["username"],
|
||||||
'timestamp': timestamp,
|
"snapshot_hash": snap["hash"],
|
||||||
'size_total': stats['total_size'],
|
"snapshot_tag": tag,
|
||||||
'files_total': stats['total_file_count'],
|
"timestamp": timestamp,
|
||||||
'snapshots_total': snapshots_total
|
"size_total": stats["total_size"],
|
||||||
})
|
"files_total": stats["total_file_count"],
|
||||||
|
"snapshots_total": snapshots_total,
|
||||||
|
}
|
||||||
|
)
|
||||||
# todo: fix the commented code when the bug is fixed in restic
|
# todo: fix the commented code when the bug is fixed in restic
|
||||||
# https://github.com/restic/restic/issues/2126
|
# https://github.com/restic/restic/issues/2126
|
||||||
# stats = self.get_stats()
|
# stats = self.get_stats()
|
||||||
check_success = self.get_check()
|
|
||||||
|
if self.disable_check:
|
||||||
|
# return 2 as "no-check" value
|
||||||
|
check_success = 2
|
||||||
|
else:
|
||||||
|
check_success = self.get_check()
|
||||||
|
|
||||||
metrics = {
|
metrics = {
|
||||||
'check_success': check_success,
|
"check_success": check_success,
|
||||||
'clients': clients,
|
"clients": clients,
|
||||||
|
"snapshots_total": len(all_snapshots),
|
||||||
|
"duration": time.time() - duration
|
||||||
# 'size_total': stats['total_size'],
|
# 'size_total': stats['total_size'],
|
||||||
# 'files_total': stats['total_file_count'],
|
# 'files_total': stats['total_file_count'],
|
||||||
'snapshots_total': len(all_snapshots)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return metrics
|
return metrics
|
||||||
|
|
||||||
def get_snapshots(self, only_latest=False):
|
def get_snapshots(self, only_latest=False):
|
||||||
cmd = [
|
cmd = [
|
||||||
'restic',
|
"restic",
|
||||||
'-r', self.repository,
|
"-r",
|
||||||
'-p', self.password_file,
|
self.repository,
|
||||||
'--no-lock',
|
"-p",
|
||||||
'snapshots', '--json'
|
self.password_file,
|
||||||
|
"--no-lock",
|
||||||
|
"snapshots",
|
||||||
|
"--json",
|
||||||
]
|
]
|
||||||
|
|
||||||
if only_latest:
|
if only_latest:
|
||||||
cmd.extend(['--latest', '1'])
|
cmd.extend(["--latest", "1"])
|
||||||
|
|
||||||
result = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
|
result = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
|
||||||
if result.returncode != 0:
|
if result.returncode != 0:
|
||||||
raise Exception("Error executing restic snapshot command. " + self.parse_stderr(result))
|
raise Exception(
|
||||||
snapshots = json.loads(result.stdout.decode('utf-8'))
|
"Error executing restic snapshot command: " + self.parse_stderr(result)
|
||||||
|
)
|
||||||
|
snapshots = json.loads(result.stdout.decode("utf-8"))
|
||||||
for snap in snapshots:
|
for snap in snapshots:
|
||||||
snap['hash'] = self.calc_snapshot_hash(snap)
|
snap["hash"] = self.calc_snapshot_hash(snap)
|
||||||
return snapshots
|
return snapshots
|
||||||
|
|
||||||
def get_stats(self, snapshot_id=None):
|
def get_stats(self, snapshot_id=None):
|
||||||
|
@ -163,19 +231,24 @@ class ResticCollector(object):
|
||||||
return self.stats_cache[snapshot_id]
|
return self.stats_cache[snapshot_id]
|
||||||
|
|
||||||
cmd = [
|
cmd = [
|
||||||
'restic',
|
"restic",
|
||||||
'-r', self.repository,
|
"-r",
|
||||||
'-p', self.password_file,
|
self.repository,
|
||||||
'--no-lock',
|
"-p",
|
||||||
'stats', '--json'
|
self.password_file,
|
||||||
|
"--no-lock",
|
||||||
|
"stats",
|
||||||
|
"--json",
|
||||||
]
|
]
|
||||||
if snapshot_id is not None:
|
if snapshot_id is not None:
|
||||||
cmd.extend([snapshot_id])
|
cmd.extend([snapshot_id])
|
||||||
|
|
||||||
result = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
|
result = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
|
||||||
if result.returncode != 0:
|
if result.returncode != 0:
|
||||||
raise Exception("Error executing restic stats command. " + self.parse_stderr(result))
|
raise Exception(
|
||||||
stats = json.loads(result.stdout.decode('utf-8'))
|
"Error executing restic stats command: " + self.parse_stderr(result)
|
||||||
|
)
|
||||||
|
stats = json.loads(result.stdout.decode("utf-8"))
|
||||||
|
|
||||||
if snapshot_id is not None:
|
if snapshot_id is not None:
|
||||||
self.stats_cache[snapshot_id] = stats
|
self.stats_cache[snapshot_id] = stats
|
||||||
|
@ -185,62 +258,86 @@ class ResticCollector(object):
|
||||||
def get_check(self):
|
def get_check(self):
|
||||||
# This command takes 20 seconds or more, but it's required
|
# This command takes 20 seconds or more, but it's required
|
||||||
cmd = [
|
cmd = [
|
||||||
'restic',
|
"restic",
|
||||||
'-r', self.repository,
|
"-r",
|
||||||
'-p', self.password_file,
|
self.repository,
|
||||||
'--no-lock',
|
"-p",
|
||||||
'check'
|
self.password_file,
|
||||||
|
"--no-lock",
|
||||||
|
"check",
|
||||||
]
|
]
|
||||||
|
|
||||||
result = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
|
result = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
|
||||||
if result.returncode == 0:
|
if result.returncode == 0:
|
||||||
return 1 # ok
|
return 1 # ok
|
||||||
logging.warning("Error checking the repository health. " + self.parse_stderr(result))
|
else:
|
||||||
return 0 # error
|
logging.warning(
|
||||||
|
"Error checking the repository health. " + self.parse_stderr(result)
|
||||||
|
)
|
||||||
|
return 0 # error
|
||||||
|
|
||||||
def calc_snapshot_hash(self, snapshot: dict) -> str:
|
def calc_snapshot_hash(self, snapshot: dict) -> str:
|
||||||
text = snapshot['hostname'] + ",".join(snapshot['paths'])
|
text = snapshot["hostname"] + ",".join(snapshot["paths"])
|
||||||
return hashlib.sha256(text.encode('utf-8')).hexdigest()
|
return hashlib.sha256(text.encode("utf-8")).hexdigest()
|
||||||
|
|
||||||
def parse_stderr(self, result):
|
def parse_stderr(self, result):
|
||||||
return result.stderr.decode('utf-8').replace("\n", " ") + " Exit code: " + str(result.returncode)
|
return (
|
||||||
|
result.stderr.decode("utf-8").replace("\n", " ")
|
||||||
|
+ " Exit code: "
|
||||||
|
+ str(result.returncode)
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
logging.basicConfig(
|
logging.basicConfig(
|
||||||
format='%(asctime)s %(levelname)-8s %(message)s',
|
format="%(asctime)s %(levelname)-8s %(message)s",
|
||||||
level=logging.getLevelName(os.environ.get("LOG_LEVEL", "INFO")),
|
level=logging.getLevelName(os.environ.get("LOG_LEVEL", "INFO")),
|
||||||
datefmt='%Y-%m-%d %H:%M:%S',
|
datefmt="%Y-%m-%d %H:%M:%S",
|
||||||
handlers=[
|
handlers=[logging.StreamHandler(sys.stdout)],
|
||||||
logging.StreamHandler(sys.stdout)
|
|
||||||
]
|
|
||||||
)
|
)
|
||||||
logging.info("Starting Restic Prometheus Exporter ...")
|
logging.info("Starting Restic Prometheus Exporter")
|
||||||
logging.info("It could take a while if the repository is remote.")
|
logging.info("It could take a while if the repository is remote")
|
||||||
|
|
||||||
try:
|
try:
|
||||||
restic_repo_url = os.environ["RESTIC_REPO_URL"]
|
restic_repo_url = os.environ["RESTIC_REPO_URL"]
|
||||||
except Exception:
|
except Exception:
|
||||||
logging.error("Configuration error. The environment variable RESTIC_REPO_URL is mandatory")
|
logging.error("The environment variable RESTIC_REPO_URL is mandatory")
|
||||||
sys.exit(1)
|
sys.exit(1)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
restic_repo_password_file = os.environ["RESTIC_REPO_PASSWORD_FILE"]
|
restic_repo_password_file = os.environ["RESTIC_REPO_PASSWORD_FILE"]
|
||||||
except Exception:
|
except Exception:
|
||||||
logging.error("Configuration error. The environment variable RESTIC_REPO_PASSWORD_FILE is mandatory")
|
logging.error("The environment variable RESTIC_REPO_PASSWORD_FILE is mandatory")
|
||||||
sys.exit(1)
|
sys.exit(1)
|
||||||
|
|
||||||
exporter_address = os.environ.get("LISTEN_ADDRESS", "0.0.0.0")
|
exporter_address = os.environ.get("LISTEN_ADDRESS", "0.0.0.0")
|
||||||
exporter_port = int(os.environ.get("LISTEN_PORT", 8001))
|
exporter_port = int(os.environ.get("LISTEN_PORT", 8001))
|
||||||
exporter_refresh_interval = int(os.environ.get("REFRESH_INTERVAL", 60))
|
exporter_refresh_interval = int(os.environ.get("REFRESH_INTERVAL", 60))
|
||||||
|
exporter_exit_on_error = bool(os.environ.get("EXIT_ON_ERROR", False))
|
||||||
|
exporter_disable_check = bool(os.environ.get("NO_CHECK", False))
|
||||||
|
exporter_disable_stats = bool(os.environ.get("NO_STATS", False))
|
||||||
|
|
||||||
collector = ResticCollector(restic_repo_url, restic_repo_password_file)
|
try:
|
||||||
|
collector = ResticCollector(
|
||||||
|
restic_repo_url,
|
||||||
|
restic_repo_password_file,
|
||||||
|
exporter_exit_on_error,
|
||||||
|
exporter_disable_check,
|
||||||
|
exporter_disable_stats,
|
||||||
|
)
|
||||||
|
REGISTRY.register(collector)
|
||||||
|
start_http_server(exporter_port, exporter_address)
|
||||||
|
logging.info(
|
||||||
|
"Serving at http://{0}:{1}".format(exporter_address, exporter_port)
|
||||||
|
)
|
||||||
|
|
||||||
prometheus_client.core.REGISTRY.register(collector)
|
while True:
|
||||||
prometheus_client.start_http_server(exporter_port, exporter_address)
|
logging.info(
|
||||||
|
"Refreshing stats every {0} seconds".format(exporter_refresh_interval)
|
||||||
|
)
|
||||||
|
time.sleep(exporter_refresh_interval)
|
||||||
|
collector.refresh()
|
||||||
|
|
||||||
logging.info("Server listening in http://%s:%d/metrics", exporter_address, exporter_port)
|
except KeyboardInterrupt:
|
||||||
while True:
|
logging.info("\nInterrupted")
|
||||||
logging.info("Refreshing stats every %d seconds", exporter_refresh_interval)
|
exit(0)
|
||||||
time.sleep(exporter_refresh_interval)
|
|
||||||
collector.refresh()
|
|
||||||
|
|
Loading…
Reference in a new issue