Performance improvements for restic-exporter

This commit is contained in:
Konstantin Shalygin 2023-03-05 23:03:48 +07:00
parent 880b47131c
commit 60617651cb
No known key found for this signature in database
GPG key ID: 3C160886BF25D873
3 changed files with 221 additions and 98 deletions

View file

@ -1,5 +1,13 @@
# Changelog # Changelog
## next_release
* Added EXIT_ON_ERROR env var - it is now possible to control the exit-on-error
behaviour
* Added NO_CHECK env var - it is now possible to skip the restic check operation
* Added NO_STATS env var - it is now possible to skip collecting per backup stats
* Added backup tag to the metric labels (if tags are present)
## 1.1.0 (2023/02/02) ## 1.1.0 (2023/02/02)
* Update Restic 0.15.1 * Update Restic 0.15.1

View file

@ -97,38 +97,56 @@ All configuration is done with environment variables:
* Amazon S3: `s3:s3.amazonaws.com/bucket_name` * Amazon S3: `s3:s3.amazonaws.com/bucket_name`
* Backblaze B2: `b2:bucketname:path/to/repo` * Backblaze B2: `b2:bucketname:path/to/repo`
- `RESTIC_REPO_PASSWORD`: Restic repository password in plain text. This is only required if `RESTIC_REPO_PASSWORD_FILE` is not defined. - `RESTIC_REPO_PASSWORD`: Restic repository password in plain text. This is only
- `RESTIC_REPO_PASSWORD_FILE`: File with the Restic repository password in plain text. This is only required if `RESTIC_REPO_PASSWORD` is not defined. Remember to mount the Docker volume with the file. required if `RESTIC_REPO_PASSWORD_FILE` is not defined.
- `AWS_ACCESS_KEY_ID`: (Optional) Required for Amazon S3, Minio and Wasabi backends. - `RESTIC_REPO_PASSWORD_FILE`: File with the Restic repository password in plain
- `AWS_SECRET_ACCESS_KEY`: (Optional) Required for Amazon S3, Minio and Wasabi backends. text. This is only required if `RESTIC_REPO_PASSWORD` is not defined. Remember
to mount the Docker volume with the file.
- `AWS_ACCESS_KEY_ID`: (Optional) Required for Amazon S3, Minio and Wasabi
backends.
- `AWS_SECRET_ACCESS_KEY`: (Optional) Required for Amazon S3, Minio and Wasabi
backends.
- `B2_ACCOUNT_ID`: (Optional) Required for Backblaze B2 backend. - `B2_ACCOUNT_ID`: (Optional) Required for Backblaze B2 backend.
- `B2_ACCOUNT_KEY`: (Optional) Required for Backblaze B2 backend. - `B2_ACCOUNT_KEY`: (Optional) Required for Backblaze B2 backend.
- `REFRESH_INTERVAL`: (Optional) Refresh interval for the metrics in seconds. Computing the metrics is a expensive task, keep this value as high as possible. Default 60 - `REFRESH_INTERVAL`: (Optional) Refresh interval for the metrics in seconds.
- `LISTEN_PORT`: (Optional) The address the exporter should listen on. The default is `8001`. Computing the metrics is a expensive task, keep this value as high as possible.
- `LISTEN_ADDRESS`: (Optional) The address the exporter should listen on. The default is to listen on all addresses. Default is `60` seconds.
- `LISTEN_PORT`: (Optional) The port the exporter should listen on. The
default is `8001`.
- `LISTEN_ADDRESS`: (Optional) The address the exporter should listen on. The
default is to listen on all addresses.
- `LOG_LEVEL`: (Optional) Log level of the traces. The default is `INFO`. - `LOG_LEVEL`: (Optional) Log level of the traces. The default is `INFO`.
- `EXIT_ON_ERROR`: (Optional) Shutdown exporter on any `restic` error. Default
is `False` (only log error, such as network error with Cloud backends).
- `NO_CHECK`: (Optional) Do not perform `restic check` operation for performance
reasons. Default is `False` (perform `restic check`).
- `NO_STATS`: (Optional) Do not collect per backup statistics for performance
reasons. Default is `False` (collect per backup statistics).
## Exported metrics ## Exported metrics
```shell ```python
# HELP restic_check_success Result of restic check operation in the repository # HELP restic_check_success Result of restic check operation in the repository
# TYPE restic_check_success gauge # TYPE restic_check_success gauge
restic_check_success 1.0 restic_check_success 1.0
# HELP restic_snapshots_total Total number of snapshots in the repository # HELP restic_snapshots_total Total number of snapshots in the repository
# TYPE restic_snapshots_total counter # TYPE restic_snapshots_total counter
restic_snapshots_total 1777.0 restic_snapshots_total 100.0
# HELP restic_backup_timestamp Timestamp of the last backup # HELP restic_backup_timestamp Timestamp of the last backup
# TYPE restic_backup_timestamp gauge # TYPE restic_backup_timestamp gauge
restic_backup_timestamp{client_hostname="PC-HOME-1",client_username="PC-HOME-1\\User-1",snapshot_hash="1911eb846f1642c327936915f1fad4e16190d0ab6b68e045294f5f0280a00ebe"} 1.669754009e+09 restic_backup_timestamp{client_hostname="product.example.com",client_username="root",snapshot_hash="20795072cba0953bcdbe52e9cf9d75e5726042f5bbf2584bb2999372398ee835",snapshot_tag="mysql"} 1.666273638e+09
# HELP restic_backup_files_total Number of files in the backup # HELP restic_backup_files_total Number of files in the backup
# TYPE restic_backup_files_total counter # TYPE restic_backup_files_total counter
restic_backup_files_total{client_hostname="PC-HOME-1",client_username="PC-HOME-1\\User-1",snapshot_hash="1911eb846f1642c327936915f1fad4e16190d0ab6b68e045294f5f0280a00ebe"} 19051.0 restic_backup_files_total{client_hostname="product.example.com",client_username="root",snapshot_hash="20795072cba0953bcdbe52e9cf9d75e5726042f5bbf2584bb2999372398ee835",snapshot_tag="mysql"} 8.0
# HELP restic_backup_size_total Total size of backup in bytes # HELP restic_backup_size_total Total size of backup in bytes
# TYPE restic_backup_size_total counter # TYPE restic_backup_size_total counter
restic_backup_size_total{client_hostname="PC-HOME-1",client_username="PC-HOME-1\\User-1",snapshot_hash="1911eb846f1642c327936915f1fad4e16190d0ab6b68e045294f5f0280a00ebe"} 4.1174838248e+010 restic_backup_size_total{client_hostname="product.example.com",client_username="root",snapshot_hash="20795072cba0953bcdbe52e9cf9d75e5726042f5bbf2584bb2999372398ee835",snapshot_tag="mysql"} 4.3309562e+07
# HELP restic_backup_snapshots_total Total number of snapshots # HELP restic_backup_snapshots_total Total number of snapshots
# TYPE restic_backup_snapshots_total counter # TYPE restic_backup_snapshots_total counter
restic_backup_snapshots_total{client_hostname="PC-HOME-1",client_username="PC-HOME-1\\User-1",snapshot_hash="1911eb846f1642c327936915f1fad4e16190d0ab6b68e045294f5f0280a00ebe"} 106.0 restic_backup_snapshots_total{client_hostname="product.example.com",client_username="root",snapshot_hash="20795072cba0953bcdbe52e9cf9d75e5726042f5bbf2584bb2999372398ee835",snapshot_tag="mysql"} 1.0
# HELP restic_scrape_duration_seconds Ammount of time each scrape takes
# TYPE restic_scrape_duration_seconds gauge
restic_scrape_duration_seconds 166.9411084651947
``` ```
## Prometheus config ## Prometheus config

View file

@ -10,20 +10,27 @@ import subprocess
import sys import sys
import traceback import traceback
import prometheus_client from prometheus_client import start_http_server
import prometheus_client.core from prometheus_client.core import GaugeMetricFamily, CounterMetricFamily, REGISTRY
class ResticCollector(object): class ResticCollector(object):
def __init__(self, repository, password_file): def __init__(
self, repository, password_file, exit_on_error, disable_check, disable_stats
):
self.repository = repository self.repository = repository
self.password_file = password_file self.password_file = password_file
self.exit_on_error = exit_on_error
self.disable_check = disable_check
self.disable_stats = disable_stats
# todo: the stats cache increases over time -> remove old ids # todo: the stats cache increases over time -> remove old ids
# todo: cold start -> the stats cache could be saved in a persistent volume # todo: cold start -> the stats cache could be saved in a persistent
# todo: cold start -> the restic cache (/root/.cache/restic) could be saved in a persistent volume # volume
# todo: cold start -> the restic cache (/root/.cache/restic) could be
# saved in a persistent volume
self.stats_cache = {} self.stats_cache = {}
self.metrics = {} self.metrics = {}
self.refresh(True) self.refresh(exit_on_error)
def collect(self): def collect(self):
logging.debug("Incoming request") logging.debug("Incoming request")
@ -31,52 +38,71 @@ class ResticCollector(object):
common_label_names = [ common_label_names = [
"client_hostname", "client_hostname",
"client_username", "client_username",
"snapshot_hash" "snapshot_hash",
"snapshot_tag",
] ]
check_success = prometheus_client.core.GaugeMetricFamily( check_success = GaugeMetricFamily(
"restic_check_success", "restic_check_success",
"Result of restic check operation in the repository", "Result of restic check operation in the repository",
labels=[]) labels=[],
)
snapshots_total = prometheus_client.core.CounterMetricFamily( snapshots_total = CounterMetricFamily(
"restic_snapshots_total", "restic_snapshots_total",
"Total number of snapshots in the repository", "Total number of snapshots in the repository",
labels=[]) labels=[],
)
backup_timestamp = prometheus_client.core.GaugeMetricFamily( backup_timestamp = GaugeMetricFamily(
"restic_backup_timestamp", "restic_backup_timestamp",
"Timestamp of the last backup", "Timestamp of the last backup",
labels=common_label_names) labels=common_label_names,
)
backup_files_total = prometheus_client.core.CounterMetricFamily( backup_files_total = CounterMetricFamily(
"restic_backup_files_total", "restic_backup_files_total",
"Number of files in the backup", "Number of files in the backup",
labels=common_label_names) labels=common_label_names,
)
backup_size_total = prometheus_client.core.CounterMetricFamily( backup_size_total = CounterMetricFamily(
"restic_backup_size_total", "restic_backup_size_total",
"Total size of backup in bytes", "Total size of backup in bytes",
labels=common_label_names) labels=common_label_names,
)
backup_snapshots_total = prometheus_client.core.CounterMetricFamily( backup_snapshots_total = CounterMetricFamily(
"restic_backup_snapshots_total", "restic_backup_snapshots_total",
"Total number of snapshots", "Total number of snapshots",
labels=common_label_names) labels=common_label_names,
)
scrape_duration_seconds = GaugeMetricFamily(
"restic_scrape_duration_seconds",
"Ammount of time each scrape takes",
labels=[],
)
check_success.add_metric([], self.metrics["check_success"]) check_success.add_metric([], self.metrics["check_success"])
snapshots_total.add_metric([], self.metrics["snapshots_total"]) snapshots_total.add_metric([], self.metrics["snapshots_total"])
for client in self.metrics['clients']: for client in self.metrics["clients"]:
common_label_values = [ common_label_values = [
client["hostname"], client["hostname"],
client["username"], client["username"],
client["snapshot_hash"] client["snapshot_hash"],
client["snapshot_tag"],
] ]
backup_timestamp.add_metric(common_label_values, client["timestamp"]) backup_timestamp.add_metric(common_label_values, client["timestamp"])
backup_files_total.add_metric(common_label_values, client["files_total"]) backup_files_total.add_metric(common_label_values, client["files_total"])
backup_size_total.add_metric(common_label_values, client["size_total"]) backup_size_total.add_metric(common_label_values, client["size_total"])
backup_snapshots_total.add_metric(common_label_values, client["snapshots_total"]) backup_snapshots_total.add_metric(
common_label_values, client["snapshots_total"]
)
scrape_duration_seconds.add_metric([], self.metrics["duration"])
yield check_success yield check_success
yield snapshots_total yield snapshots_total
@ -84,75 +110,117 @@ class ResticCollector(object):
yield backup_files_total yield backup_files_total
yield backup_size_total yield backup_size_total
yield backup_snapshots_total yield backup_snapshots_total
yield scrape_duration_seconds
def refresh(self, exit_on_error=False):
    """Re-collect the metrics and store them in ``self.metrics``.

    A failed collection is logged as a single line and the previously
    stored metrics are left untouched.

    Args:
        exit_on_error: When true, a failed collection ends the process
            with exit status 1 after the error has been logged.
    """
    try:
        fresh_metrics = self.get_metrics()
    except Exception:
        # Keep the log entry on one line: drop the stack and fold newlines.
        summary = traceback.format_exc(0).replace("\n", " ")
        logging.error("Unable to collect metrics from Restic. %s", summary)

        # Shutdown exporter for any error
        if not exit_on_error:
            return
        sys.exit(1)
    else:
        self.metrics = fresh_metrics
def get_metrics(self):
    """Collect every value exported by this collector.

    Returns:
        dict with the keys:

        * ``check_success``: result of ``get_check`` (1 ok, 0 error), or 2
          when the check is disabled.
        * ``clients``: one dict per latest snapshot with ``hostname``,
          ``username``, ``snapshot_hash``, ``snapshot_tag``, ``timestamp``,
          ``size_total``, ``files_total`` and ``snapshots_total``.
        * ``snapshots_total``: number of snapshots in the repository.
        * ``duration``: seconds spent inside this call.

    Raises:
        Exception: propagated from the ``restic`` helper methods.
    """
    started = time.time()

    all_snapshots = self.get_snapshots()
    latest_snapshots = self.get_snapshots(True)

    # Count the snapshots of every backup hash once, instead of re-scanning
    # the complete snapshot list for each client.
    snapshot_counts = {}
    for snap in all_snapshots:
        snapshot_counts[snap["hash"]] = snapshot_counts.get(snap["hash"], 0) + 1

    clients = []
    for snap in latest_snapshots:
        # Collect stats for each snap only if enabled
        if self.disable_stats:
            # -1 is the "no-stats" placeholder value
            stats = {
                "total_size": -1,
                "total_file_count": -1,
            }
        else:
            stats = self.get_stats(snap["id"])

        # Use the first tag when there is one; a missing, null or empty
        # tag list yields an empty label value instead of an IndexError.
        tags = snap.get("tags") or []
        tag = tags[0] if tags else ""

        # Drop the fractional seconds; a trailing "Z" directly after the
        # fraction is removed together with it.
        time_parsed = re.sub(r"\.[^+-]+", "", snap["time"])
        if len(time_parsed) > 19:
            # restic 14: '2023-01-12T06:59:33.1576588+01:00' ->
            #            '2023-01-12T06:59:33+01:00'
            moment = datetime.datetime.strptime(time_parsed, "%Y-%m-%dT%H:%M:%S%z")
        else:
            # restic 12: '2023-02-01T14:14:19.30760523Z' ->
            #            '2023-02-01T14:14:19'
            # The offset was a "Z" that the regex removed, so the value is
            # in UTC.
            moment = datetime.datetime.strptime(
                time_parsed, "%Y-%m-%dT%H:%M:%S"
            ).replace(tzinfo=datetime.timezone.utc)
        # An aware datetime converts to the exact epoch value; going through
        # time.mktime() would discard the offset and use the local zone.
        timestamp = moment.timestamp()

        clients.append(
            {
                "hostname": snap["hostname"],
                "username": snap["username"],
                "snapshot_hash": snap["hash"],
                "snapshot_tag": tag,
                "timestamp": timestamp,
                "size_total": stats["total_size"],
                "files_total": stats["total_file_count"],
                "snapshots_total": snapshot_counts.get(snap["hash"], 0),
            }
        )

    # todo: repository-wide size/file totals (self.get_stats() without a
    # snapshot id) stay disabled until this restic bug is fixed:
    # https://github.com/restic/restic/issues/2126

    if self.disable_check:
        # return 2 as "no-check" value
        check_success = 2
    else:
        check_success = self.get_check()

    return {
        "check_success": check_success,
        "clients": clients,
        "snapshots_total": len(all_snapshots),
        "duration": time.time() - started,
    }
def get_snapshots(self, only_latest=False):
    """Return the snapshot list printed by ``restic snapshots --json``.

    Every returned snapshot dict gets an extra ``hash`` key computed by
    ``calc_snapshot_hash``.

    Args:
        only_latest: When true, ``--latest 1`` is appended to the command.

    Raises:
        Exception: if ``restic`` finishes with a non-zero exit code.
    """
    latest_args = ["--latest", "1"] if only_latest else []
    cmd = [
        "restic",
        "-r",
        self.repository,
        "-p",
        self.password_file,
        "--no-lock",
        "snapshots",
        "--json",
    ] + latest_args

    proc = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    if proc.returncode != 0:
        raise Exception(
            "Error executing restic snapshot command: " + self.parse_stderr(proc)
        )

    snapshots = json.loads(proc.stdout.decode("utf-8"))
    for entry in snapshots:
        entry["hash"] = self.calc_snapshot_hash(entry)
    return snapshots
def get_stats(self, snapshot_id=None): def get_stats(self, snapshot_id=None):
@ -163,19 +231,24 @@ class ResticCollector(object):
return self.stats_cache[snapshot_id] return self.stats_cache[snapshot_id]
cmd = [ cmd = [
'restic', "restic",
'-r', self.repository, "-r",
'-p', self.password_file, self.repository,
'--no-lock', "-p",
'stats', '--json' self.password_file,
"--no-lock",
"stats",
"--json",
] ]
if snapshot_id is not None: if snapshot_id is not None:
cmd.extend([snapshot_id]) cmd.extend([snapshot_id])
result = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) result = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
if result.returncode != 0: if result.returncode != 0:
raise Exception("Error executing restic stats command. " + self.parse_stderr(result)) raise Exception(
stats = json.loads(result.stdout.decode('utf-8')) "Error executing restic stats command: " + self.parse_stderr(result)
)
stats = json.loads(result.stdout.decode("utf-8"))
if snapshot_id is not None: if snapshot_id is not None:
self.stats_cache[snapshot_id] = stats self.stats_cache[snapshot_id] = stats
@ -185,62 +258,86 @@ class ResticCollector(object):
def get_check(self):
    """Run ``restic check`` on the repository.

    Returns:
        1 when the command exits with status 0, otherwise 0 (the failure
        is logged as a warning).
    """
    # This command takes 20 seconds or more, but it's required
    proc = subprocess.run(
        [
            "restic",
            "-r",
            self.repository,
            "-p",
            self.password_file,
            "--no-lock",
            "check",
        ],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )
    if proc.returncode != 0:
        logging.warning(
            "Error checking the repository health. " + self.parse_stderr(proc)
        )
        return 0  # error
    return 1  # ok
def calc_snapshot_hash(self, snapshot: dict) -> str:
    """Return the SHA-256 hex digest identifying a snapshot's backup.

    The digest is computed over the snapshot's hostname immediately
    followed by its paths joined with commas.
    """
    joined_paths = ",".join(snapshot["paths"])
    digest = hashlib.sha256()
    digest.update((snapshot["hostname"] + joined_paths).encode("utf-8"))
    return digest.hexdigest()
def parse_stderr(self, result):
    """Format a finished process's stderr and exit code as one line.

    Args:
        result: Object exposing ``stderr`` (bytes) and ``returncode``.

    Returns:
        The UTF-8 decoded stderr with newlines replaced by spaces,
        followed by `` Exit code: <returncode>``.
    """
    flattened = result.stderr.decode("utf-8").replace("\n", " ")
    return "{0} Exit code: {1}".format(flattened, str(result.returncode))
def env_flag(name, default=False):
    """Read a boolean switch from the environment variable ``name``.

    ``bool()`` on the raw string is true for every non-empty value, so
    ``NO_CHECK=False`` would switch the flag on. Here an unset variable
    yields ``default``; an empty value, ``0``, ``false``, ``no`` and ``off``
    (case-insensitive, surrounding whitespace ignored) yield ``False``; any
    other value yields ``True``, as before.
    """
    raw = os.environ.get(name)
    if raw is None:
        return default
    return raw.strip().lower() not in ("", "0", "false", "no", "off")


if __name__ == "__main__":
    logging.basicConfig(
        format="%(asctime)s %(levelname)-8s %(message)s",
        level=logging.getLevelName(os.environ.get("LOG_LEVEL", "INFO")),
        datefmt="%Y-%m-%d %H:%M:%S",
        handlers=[logging.StreamHandler(sys.stdout)],
    )
    logging.info("Starting Restic Prometheus Exporter")
    logging.info("It could take a while if the repository is remote")

    # Mandatory settings: a missing variable stops the exporter.
    try:
        restic_repo_url = os.environ["RESTIC_REPO_URL"]
    except KeyError:
        logging.error("The environment variable RESTIC_REPO_URL is mandatory")
        sys.exit(1)

    try:
        restic_repo_password_file = os.environ["RESTIC_REPO_PASSWORD_FILE"]
    except KeyError:
        logging.error("The environment variable RESTIC_REPO_PASSWORD_FILE is mandatory")
        sys.exit(1)

    # Optional settings.
    exporter_address = os.environ.get("LISTEN_ADDRESS", "0.0.0.0")
    exporter_port = int(os.environ.get("LISTEN_PORT", 8001))
    exporter_refresh_interval = int(os.environ.get("REFRESH_INTERVAL", 60))
    exporter_exit_on_error = env_flag("EXIT_ON_ERROR")
    exporter_disable_check = env_flag("NO_CHECK")
    exporter_disable_stats = env_flag("NO_STATS")

    try:
        # The constructor performs the first (possibly slow) collection.
        collector = ResticCollector(
            restic_repo_url,
            restic_repo_password_file,
            exporter_exit_on_error,
            exporter_disable_check,
            exporter_disable_stats,
        )
        REGISTRY.register(collector)
        start_http_server(exporter_port, exporter_address)
        logging.info(
            "Serving at http://{0}:{1}".format(exporter_address, exporter_port)
        )

        while True:
            logging.info(
                "Refreshing stats every {0} seconds".format(exporter_refresh_interval)
            )
            time.sleep(exporter_refresh_interval)
            # Pass the flag so EXIT_ON_ERROR also applies to the periodic
            # refreshes, not only to the collection done at start-up.
            collector.refresh(exporter_exit_on_error)

    except KeyboardInterrupt:
        logging.info("\nInterrupted")
        # sys.exit instead of the site-provided exit() helper.
        sys.exit(0)