234 lines
8.2 KiB
Python
234 lines
8.2 KiB
Python
|
#!/usr/bin/env python3
import datetime
import hashlib
import json
import logging
import os
import re
import subprocess
import sys
import time
from collections import Counter

import prometheus_client
import prometheus_client.core
|
||
|
|
||
|
|
||
|
class ResticCollector(object):
    """Prometheus collector exposing backup metrics of a restic repository.

    All metrics are produced by shelling out to the ``restic`` CLI.
    ``refresh()`` runs the (expensive) restic commands and caches the result
    in ``self.metrics``; ``collect()`` only reads that cache, so scrapes are
    cheap and never block on restic.
    """

    def __init__(self, repository, password_file_):
        # Repository URL/path handed to `restic -r`.
        self.repository = repository
        # File containing the repository password, handed to `restic -p`.
        self.password_file = password_file_
        # todo: the stats cache increases over time -> remove old ids
        # todo: cold start -> the stats cache could be saved in a persistent volume
        # todo: cold start -> the restic cache (/root/.cache/restic) could be saved in a persistent volume
        self.stats_cache = {}
        self.metrics = {}
        self.refresh()

    def collect(self):
        """Yield the cached metrics in Prometheus exposition format."""
        logging.debug("Incoming request")

        common_label_names = [
            "client_hostname",
            "client_username",
            "snapshot_hash"
        ]

        check_success = prometheus_client.core.GaugeMetricFamily(
            "restic_check_success",
            "Result of restic check operation in the repository",
            labels=[])

        snapshots_total = prometheus_client.core.CounterMetricFamily(
            "restic_snapshots_total",
            "Total number of snapshots in the repository",
            labels=[])

        backup_timestamp = prometheus_client.core.GaugeMetricFamily(
            "restic_backup_timestamp",
            "Timestamp of the last backup",
            labels=common_label_names)

        backup_files_total = prometheus_client.core.CounterMetricFamily(
            "restic_backup_files_total",
            "Number of files in the backup",
            labels=common_label_names)

        backup_size_total = prometheus_client.core.CounterMetricFamily(
            "restic_backup_size_total",
            "Total size of backup in bytes",
            labels=common_label_names)

        backup_snapshots_total = prometheus_client.core.CounterMetricFamily(
            "restic_backup_snapshots_total",
            "Total number of snapshots",
            labels=common_label_names)

        check_success.add_metric([], self.metrics["check_success"])
        snapshots_total.add_metric([], self.metrics["snapshots_total"])

        # One labelled sample per "client" (latest snapshot of a
        # hostname/paths combination).
        for client in self.metrics['clients']:
            common_label_values = [
                client["hostname"],
                client["username"],
                client["snapshot_hash"]
            ]
            backup_timestamp.add_metric(common_label_values, client["timestamp"])
            backup_files_total.add_metric(common_label_values, client["files_total"])
            backup_size_total.add_metric(common_label_values, client["size_total"])
            backup_snapshots_total.add_metric(common_label_values, client["snapshots_total"])

        yield check_success
        yield snapshots_total
        yield backup_timestamp
        yield backup_files_total
        yield backup_size_total
        yield backup_snapshots_total

    def refresh(self):
        """Re-run the restic commands; on failure keep the previous metrics."""
        try:
            self.metrics = self.get_metrics()
        except Exception as e:
            logging.error("Unable to collect metrics from Restic. Error: %s", str(e))

    def get_metrics(self):
        """Build and return the metrics dict.

        Keys: 'check_success', 'clients' (one entry per latest snapshot)
        and 'snapshots_total'.
        """
        all_snapshots = self.get_snapshots()
        latest_snapshots = self.get_snapshots(True)

        # Count snapshots per snapshot hash once (O(n)) instead of rescanning
        # the full snapshot list for every latest snapshot (was O(n*m)).
        snapshots_per_hash = Counter(snap['hash'] for snap in all_snapshots)

        clients = []
        for snap in latest_snapshots:
            stats = self.get_stats(snap['id'])

            # Strip the fractional seconds restic emits (strptime cannot
            # parse nanoseconds); the trailing UTC offset is kept for %z.
            time_parsed = re.sub(r'\.[^+-]+', '', snap['time'])
            # .timestamp() honours the embedded UTC offset. The previous
            # time.mktime(...timetuple()) re-interpreted the wall-clock time
            # in the exporter's local timezone, producing a wrong epoch
            # whenever the two zones differed.
            timestamp = datetime.datetime.strptime(
                time_parsed, "%Y-%m-%dT%H:%M:%S%z").timestamp()

            clients.append({
                'snapshot_hash': snap['hash'],
                'hostname': snap['hostname'],
                'username': snap['username'],
                'timestamp': timestamp,
                'size_total': stats['total_size'],
                'files_total': stats['total_file_count'],
                'snapshots_total': snapshots_per_hash[snap['hash']]
            })

        # todo: fix the commented code when the bug is fixed in restic
        # https://github.com/restic/restic/issues/2126
        # stats = self.get_stats()
        check_success = self.get_check()
        metrics = {
            'check_success': check_success,
            'clients': clients,
            # 'size_total': stats['total_size'],
            # 'files_total': stats['total_file_count'],
            'snapshots_total': len(all_snapshots)
        }
        return metrics

    def get_snapshots(self, only_latest=False):
        """Return the repository snapshots as parsed JSON.

        When only_latest is True restic keeps only the newest snapshot per
        host/path combination. Each snapshot dict gets an extra 'hash' key
        (see calc_snapshot_hash). Raises Exception on a non-zero exit code.
        """
        cmd = [
            'restic',
            '-r', self.repository,
            '-p', self.password_file,
            '--no-lock',
            'snapshots', '--json'
        ]
        if only_latest:
            cmd.extend(['--latest', '1'])

        # Capture stderr too, so restic's own error message ends up in the
        # raised exception (and thus in the exporter log) instead of lost.
        result = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        if result.returncode != 0:
            raise Exception("Error executing restic snapshot command. Exit code: "
                            + str(result.returncode) + ". Stderr: "
                            + result.stderr.decode('utf-8', errors='replace'))
        snapshots = json.loads(result.stdout.decode('utf-8'))
        for snap in snapshots:
            snap['hash'] = self.calc_snapshot_hash(snap)
        return snapshots

    def get_stats(self, snapshot_id=None):
        """Return `restic stats --json` for one snapshot (or the whole repo).

        Per-snapshot results are memoized in self.stats_cache because the
        restic command is expensive. Raises Exception on non-zero exit code.
        """
        # This command is expensive in CPU/Memory (1-5 seconds),
        # and much more when snapshot_id=None (3 minutes) -> we avoid this call for now
        # https://github.com/restic/restic/issues/2126
        if snapshot_id is not None and snapshot_id in self.stats_cache:
            return self.stats_cache[snapshot_id]

        cmd = [
            'restic',
            '-r', self.repository,
            '-p', self.password_file,
            '--no-lock',
            'stats', '--json'
        ]
        if snapshot_id is not None:
            cmd.append(snapshot_id)

        result = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        if result.returncode != 0:
            raise Exception("Error executing restic stats command. Exit code: "
                            + str(result.returncode) + ". Stderr: "
                            + result.stderr.decode('utf-8', errors='replace'))
        stats = json.loads(result.stdout.decode('utf-8'))

        if snapshot_id is not None:
            self.stats_cache[snapshot_id] = stats

        return stats

    def get_check(self):
        """Run `restic check`; return 1 on success, 0 on any failure."""
        # This command takes 20 seconds or more, but it's required
        cmd = [
            'restic',
            '-r', self.repository,
            '-p', self.password_file,
            '--no-lock',
            'check'
        ]

        result = subprocess.run(cmd, stdout=subprocess.PIPE)
        if result.returncode == 0:
            return 1  # ok
        return 0  # error

    def calc_snapshot_hash(self, snapshot: dict) -> str:
        """Return a stable sha256 identity derived from hostname and paths.

        Used to group snapshots taken by the same client of the same paths.
        """
        text = snapshot['hostname'] + ",".join(snapshot['paths'])
        return hashlib.sha256(text.encode('utf-8')).hexdigest()
|
||
|
|
||
|
|
||
|
if __name__ == "__main__":
    logging.basicConfig(
        format='%(asctime)s %(levelname)-8s %(message)s',
        level=logging.getLevelName(os.environ.get("LOG_LEVEL", "INFO")),
        datefmt='%Y-%m-%d %H:%M:%S',
        handlers=[
            logging.StreamHandler(sys.stdout)
        ]
    )
    logging.info("Starting Restic Prometheus Exporter ...")
    logging.info("It could take a while if the repository is remote.")

    # Both variables are mandatory. Catch only the KeyError raised by a
    # missing key; the previous blanket `except Exception` could also mask
    # unrelated failures as a configuration error.
    try:
        restic_repo_url = os.environ["RESTIC_REPO_URL"]
    except KeyError:
        logging.error("Configuration error. The environment variable RESTIC_REPO_URL is mandatory")
        sys.exit(1)

    try:
        password_file = os.environ["PASSWORD_FILE"]
    except KeyError:
        logging.error("Configuration error. The environment variable PASSWORD_FILE is mandatory")
        sys.exit(1)

    # Optional settings with sensible defaults.
    exporter_address = os.environ.get("LISTEN_ADDRESS", "0.0.0.0")
    exporter_port = int(os.environ.get("LISTEN_PORT", 8001))
    exporter_refresh_interval = int(os.environ.get("REFRESH_INTERVAL", 60))

    # The constructor performs the first (slow) refresh before serving.
    collector = ResticCollector(restic_repo_url, password_file)

    prometheus_client.core.REGISTRY.register(collector)
    prometheus_client.start_http_server(exporter_port, exporter_address)

    logging.info("Server listening in http://%s:%d/metrics", exporter_address, exporter_port)
    while True:
        logging.info("Refreshing stats every %d seconds", exporter_refresh_interval)
        time.sleep(exporter_refresh_interval)
        collector.refresh()
|