From 60617651cb98cec4d53fcad0bcfa43af6f2cce59 Mon Sep 17 00:00:00 2001
From: Konstantin Shalygin
Date: Sun, 5 Mar 2023 23:03:48 +0700
Subject: [PATCH] Performance improvements for restic-exporter

---
 CHANGELOG.md       |   8 ++
 README.md          |  44 +++++---
 restic-exporter.py | 267 ++++++++++++++++++++++++++++++---------
 3 files changed, 221 insertions(+), 98 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 8b95af4..5a84c60 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,13 @@
 # Changelog
 
+## next_release
+
+* Added EXIT_ON_ERROR env var - it is now possible to control the
+exit-on-error behaviour
+* Added NO_CHECK env var - it is now possible to skip the `restic check`
+operation
+* Added NO_STATS env var - it is now possible to skip per-backup stats
+collection
+* Added backup tag to the metric labels (if tags are present)
+
 ## 1.1.0 (2023/02/02)
 
 * Update Restic 0.15.1

diff --git a/README.md b/README.md
index 7b673db..e5ade10 100644
--- a/README.md
+++ b/README.md
@@ -97,38 +97,56 @@ All configuration is done with environment variables:
   * Amazon S3: `s3:s3.amazonaws.com/bucket_name`
   * Backblaze B2: `b2:bucketname:path/to/repo`
-- `RESTIC_REPO_PASSWORD`: Restic repository password in plain text. This is only required if `RESTIC_REPO_PASSWORD_FILE` is not defined.
-- `RESTIC_REPO_PASSWORD_FILE`: File with the Restic repository password in plain text. This is only required if `RESTIC_REPO_PASSWORD` is not defined. Remember to mount the Docker volume with the file.
-- `AWS_ACCESS_KEY_ID`: (Optional) Required for Amazon S3, Minio and Wasabi backends.
-- `AWS_SECRET_ACCESS_KEY`: (Optional) Required for Amazon S3, Minio and Wasabi backends.
+- `RESTIC_REPO_PASSWORD`: Restic repository password in plain text. This is only
+required if `RESTIC_REPO_PASSWORD_FILE` is not defined.
+- `RESTIC_REPO_PASSWORD_FILE`: File with the Restic repository password in plain
+text. This is only required if `RESTIC_REPO_PASSWORD` is not defined. Remember
+to mount the Docker volume with the file.
+- `AWS_ACCESS_KEY_ID`: (Optional) Required for Amazon S3, Minio and Wasabi
+backends.
+- `AWS_SECRET_ACCESS_KEY`: (Optional) Required for Amazon S3, Minio and Wasabi
+backends.
 - `B2_ACCOUNT_ID`: (Optional) Required for Backblaze B2 backend.
 - `B2_ACCOUNT_KEY`: (Optional) Required for Backblaze B2 backend.
-- `REFRESH_INTERVAL`: (Optional) Refresh interval for the metrics in seconds. Computing the metrics is a expensive task, keep this value as high as possible. Default 60
-- `LISTEN_PORT`: (Optional) The address the exporter should listen on. The default is `8001`.
-- `LISTEN_ADDRESS`: (Optional) The address the exporter should listen on. The default is to listen on all addresses.
+- `REFRESH_INTERVAL`: (Optional) Refresh interval for the metrics in seconds.
+Computing the metrics is an expensive task, so keep this value as high as
+possible. Default is `60` seconds.
+- `LISTEN_PORT`: (Optional) The port the exporter should listen on. The
+default is `8001`.
+- `LISTEN_ADDRESS`: (Optional) The address the exporter should listen on. The
+default is to listen on all addresses.
 - `LOG_LEVEL`: (Optional) Log level of the traces. The default is `INFO`.
+- `EXIT_ON_ERROR`: (Optional) Shut down the exporter on any `restic` error.
+Default is `False` (only log the error, such as a network error with cloud
+backends).
+- `NO_CHECK`: (Optional) Do not perform the `restic check` operation, for
+performance reasons. Default is `False` (perform `restic check`).
+- `NO_STATS`: (Optional) Do not collect per-backup statistics, for performance
+reasons. Default is `False` (collect per-backup statistics).
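+
+For example, the performance-related variables can be set in a Docker Compose
+service like the following. This is a minimal sketch: the image name, volume
+paths and repository URL are placeholders, adjust them to your setup.
+
+```yaml
+services:
+  restic-exporter:
+    image: restic-exporter:latest  # placeholder image name
+    environment:
+      RESTIC_REPO_URL: "s3:s3.amazonaws.com/bucket_name"  # placeholder repository
+      RESTIC_REPO_PASSWORD_FILE: "/restic_password"
+      REFRESH_INTERVAL: "1800"  # compute metrics every 30 minutes
+      NO_CHECK: "1"   # skip the slow `restic check` operation
+      NO_STATS: "1"   # skip per-backup statistics collection
+    volumes:
+      - ./restic_password:/restic_password:ro  # placeholder path
+    ports:
+      - "8001:8001"
+```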
 
 ## Exported metrics
 
-```shell
+```python
 # HELP restic_check_success Result of restic check operation in the repository
 # TYPE restic_check_success gauge
 restic_check_success 1.0
 # HELP restic_snapshots_total Total number of snapshots in the repository
 # TYPE restic_snapshots_total counter
-restic_snapshots_total 1777.0
+restic_snapshots_total 100.0
 # HELP restic_backup_timestamp Timestamp of the last backup
 # TYPE restic_backup_timestamp gauge
-restic_backup_timestamp{client_hostname="PC-HOME-1",client_username="PC-HOME-1\\User-1",snapshot_hash="1911eb846f1642c327936915f1fad4e16190d0ab6b68e045294f5f0280a00ebe"} 1.669754009e+09
+restic_backup_timestamp{client_hostname="product.example.com",client_username="root",snapshot_hash="20795072cba0953bcdbe52e9cf9d75e5726042f5bbf2584bb2999372398ee835",snapshot_tag="mysql"} 1.666273638e+09
 # HELP restic_backup_files_total Number of files in the backup
 # TYPE restic_backup_files_total counter
-restic_backup_files_total{client_hostname="PC-HOME-1",client_username="PC-HOME-1\\User-1",snapshot_hash="1911eb846f1642c327936915f1fad4e16190d0ab6b68e045294f5f0280a00ebe"} 19051.0
+restic_backup_files_total{client_hostname="product.example.com",client_username="root",snapshot_hash="20795072cba0953bcdbe52e9cf9d75e5726042f5bbf2584bb2999372398ee835",snapshot_tag="mysql"} 8.0
 # HELP restic_backup_size_total Total size of backup in bytes
 # TYPE restic_backup_size_total counter
-restic_backup_size_total{client_hostname="PC-HOME-1",client_username="PC-HOME-1\\User-1",snapshot_hash="1911eb846f1642c327936915f1fad4e16190d0ab6b68e045294f5f0280a00ebe"} 4.1174838248e+010
+restic_backup_size_total{client_hostname="product.example.com",client_username="root",snapshot_hash="20795072cba0953bcdbe52e9cf9d75e5726042f5bbf2584bb2999372398ee835",snapshot_tag="mysql"} 4.3309562e+07
 # HELP restic_backup_snapshots_total Total number of snapshots
 # TYPE restic_backup_snapshots_total counter
-restic_backup_snapshots_total{client_hostname="PC-HOME-1",client_username="PC-HOME-1\\User-1",snapshot_hash="1911eb846f1642c327936915f1fad4e16190d0ab6b68e045294f5f0280a00ebe"} 106.0
+restic_backup_snapshots_total{client_hostname="product.example.com",client_username="root",snapshot_hash="20795072cba0953bcdbe52e9cf9d75e5726042f5bbf2584bb2999372398ee835",snapshot_tag="mysql"} 1.0
+# HELP restic_scrape_duration_seconds Amount of time each scrape takes
+# TYPE restic_scrape_duration_seconds gauge
+restic_scrape_duration_seconds 166.9411084651947
 ```
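+
+Note that when `NO_CHECK` is set, `restic_check_success` is exported as `2`
+("not checked"), and when `NO_STATS` is set, `restic_backup_size_total` and
+`restic_backup_files_total` are exported as `-1`. Queries and alerts should
+account for these sentinel values. As a sketch (not shipped with this
+exporter), an alerting rule on failed checks could look like this:
+
+```yaml
+groups:
+  - name: restic
+    rules:
+      - alert: ResticCheckFailed
+        # 0 = check failed; 1 = check ok; 2 = check disabled via NO_CHECK
+        expr: restic_check_success == 0
+        for: 1h
+        labels:
+          severity: critical
+        annotations:
+          summary: "restic check failed for the repository"
+```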
 
 ## Prometheus config
 
diff --git a/restic-exporter.py b/restic-exporter.py
index 08799e5..03edee7 100644
--- a/restic-exporter.py
+++ b/restic-exporter.py
@@ -10,20 +10,27 @@ import subprocess
 import sys
 import traceback
 
-import prometheus_client
-import prometheus_client.core
+from prometheus_client import start_http_server
+from prometheus_client.core import GaugeMetricFamily, CounterMetricFamily, REGISTRY
 
 
 class ResticCollector(object):
-    def __init__(self, repository, password_file):
+    def __init__(
+        self, repository, password_file, exit_on_error, disable_check, disable_stats
+    ):
         self.repository = repository
         self.password_file = password_file
+        self.exit_on_error = exit_on_error
+        self.disable_check = disable_check
+        self.disable_stats = disable_stats
         # todo: the stats cache increases over time -> remove old ids
-        # todo: cold start -> the stats cache could be saved in a persistent volume
-        # todo: cold start -> the restic cache (/root/.cache/restic) could be saved in a persistent volume
+        # todo: cold start -> the stats cache could be saved in a persistent
+        # volume
+        # todo: cold start -> the restic cache (/root/.cache/restic) could be
+        # saved in a persistent volume
         self.stats_cache = {}
         self.metrics = {}
-        self.refresh(True)
+        self.refresh(exit_on_error)
 
     def collect(self):
         logging.debug("Incoming request")
@@ -31,52 +38,71 @@ class ResticCollector(object):
         common_label_names = [
             "client_hostname",
             "client_username",
-            "snapshot_hash"
+            "snapshot_hash",
+            "snapshot_tag",
         ]
 
-        check_success = prometheus_client.core.GaugeMetricFamily(
+        check_success = GaugeMetricFamily(
            "restic_check_success",
            "Result of restic check operation in the repository",
-            labels=[])
+            labels=[],
+        )
 
-        snapshots_total = prometheus_client.core.CounterMetricFamily(
+        snapshots_total = CounterMetricFamily(
            "restic_snapshots_total",
            "Total number of snapshots in the repository",
-            labels=[])
+            labels=[],
+        )
 
-        backup_timestamp = prometheus_client.core.GaugeMetricFamily(
+        backup_timestamp = GaugeMetricFamily(
            "restic_backup_timestamp",
            "Timestamp of the last backup",
-            labels=common_label_names)
+            labels=common_label_names,
+        )
 
-        backup_files_total = prometheus_client.core.CounterMetricFamily(
+        backup_files_total = CounterMetricFamily(
            "restic_backup_files_total",
            "Number of files in the backup",
-            labels=common_label_names)
+            labels=common_label_names,
+        )
 
-        backup_size_total = prometheus_client.core.CounterMetricFamily(
+        backup_size_total = CounterMetricFamily(
            "restic_backup_size_total",
            "Total size of backup in bytes",
-            labels=common_label_names)
+            labels=common_label_names,
+        )
 
-        backup_snapshots_total = prometheus_client.core.CounterMetricFamily(
+        backup_snapshots_total = CounterMetricFamily(
            "restic_backup_snapshots_total",
            "Total number of snapshots",
-            labels=common_label_names)
+            labels=common_label_names,
+        )
+
+        scrape_duration_seconds = GaugeMetricFamily(
+            "restic_scrape_duration_seconds",
+            "Amount of time each scrape takes",
+            labels=[],
+        )
 
         check_success.add_metric([], self.metrics["check_success"])
         snapshots_total.add_metric([], self.metrics["snapshots_total"])
 
-        for client in self.metrics['clients']:
+        for client in self.metrics["clients"]:
             common_label_values = [
                 client["hostname"],
                 client["username"],
-                client["snapshot_hash"]
+                client["snapshot_hash"],
+                client["snapshot_tag"],
             ]
+
             backup_timestamp.add_metric(common_label_values, client["timestamp"])
             backup_files_total.add_metric(common_label_values, client["files_total"])
             backup_size_total.add_metric(common_label_values, client["size_total"])
-            backup_snapshots_total.add_metric(common_label_values, client["snapshots_total"])
+            backup_snapshots_total.add_metric(
+                common_label_values, client["snapshots_total"]
+            )
+
+        scrape_duration_seconds.add_metric([], self.metrics["duration"])
 
         yield check_success
         yield snapshots_total
@@ -84,75 +110,117 @@ class ResticCollector(object):
         yield backup_files_total
         yield backup_size_total
         yield backup_snapshots_total
+        yield scrape_duration_seconds
 
     def refresh(self, exit_on_error=False):
         try:
             self.metrics = self.get_metrics()
         except Exception:
-            logging.error("Unable to collect metrics from Restic. %s", traceback.format_exc(0).replace("\n", " "))
+            logging.error(
+                "Unable to collect metrics from Restic. %s",
+                traceback.format_exc(0).replace("\n", " "),
+            )
+
+            # Shut down the exporter on any error
             if exit_on_error:
                 sys.exit(1)
%s", + traceback.format_exc(0).replace("\n", " "), + ) + + # Shutdown exporter for any error if exit_on_error: sys.exit(1) def get_metrics(self): + duration = time.time() all_snapshots = self.get_snapshots() latest_snapshots = self.get_snapshots(True) clients = [] for snap in latest_snapshots: - stats = self.get_stats(snap['id']) + # Collect stats for each snap only if enabled + if self.disable_stats: + # return zero as "no-stats" value + stats = { + "total_size": -1, + "total_file_count": -1, + } + else: + stats = self.get_stats(snap["id"]) - time_parsed = re.sub(r'\.[^+-]+', '', snap['time']) + # use first element of tags if tags is present + if "tags" in snap: + tag = snap["tags"][0] + else: + tag = "" + + time_parsed = re.sub(r"\.[^+-]+", "", snap["time"]) if len(time_parsed) > 19: - # restic 14: '2023-01-12T06:59:33.1576588+01:00' -> '2023-01-12T06:59:33+01:00' + # restic 14: '2023-01-12T06:59:33.1576588+01:00' -> + # '2023-01-12T06:59:33+01:00' time_format = "%Y-%m-%dT%H:%M:%S%z" else: - # restic 12: '2023-02-01T14:14:19.30760523Z' -> '2023-02-01T14:14:19' + # restic 12: '2023-02-01T14:14:19.30760523Z' -> + # '2023-02-01T14:14:19' time_format = "%Y-%m-%dT%H:%M:%S" - timestamp = time.mktime(datetime.datetime.strptime(time_parsed, time_format).timetuple()) + timestamp = time.mktime( + datetime.datetime.strptime(time_parsed, time_format).timetuple() + ) snapshots_total = 0 for snap2 in all_snapshots: - if snap2['hash'] == snap['hash']: + if snap2["hash"] == snap["hash"]: snapshots_total += 1 - clients.append({ - 'snapshot_hash': snap['hash'], - 'hostname': snap['hostname'], - 'username': snap['username'], - 'timestamp': timestamp, - 'size_total': stats['total_size'], - 'files_total': stats['total_file_count'], - 'snapshots_total': snapshots_total - }) + clients.append( + { + "hostname": snap["hostname"], + "username": snap["username"], + "snapshot_hash": snap["hash"], + "snapshot_tag": tag, + "timestamp": timestamp, + "size_total": stats["total_size"], + "files_total": stats["total_file_count"], + "snapshots_total": snapshots_total, + } + ) # todo: fix the commented code when the bug is fixed in restic # https://github.com/restic/restic/issues/2126 # stats = self.get_stats() - check_success = self.get_check() + + if self.disable_check: + # return 2 as "no-check" value + check_success = 2 + else: + check_success = self.get_check() + metrics = { - 'check_success': check_success, - 'clients': clients, + "check_success": check_success, + "clients": clients, + "snapshots_total": len(all_snapshots), + "duration": time.time() - duration # 'size_total': stats['total_size'], # 'files_total': stats['total_file_count'], - 'snapshots_total': len(all_snapshots) } + return metrics def get_snapshots(self, only_latest=False): cmd = [ - 'restic', - '-r', self.repository, - '-p', self.password_file, - '--no-lock', - 'snapshots', '--json' + "restic", + "-r", + self.repository, + "-p", + self.password_file, + "--no-lock", + "snapshots", + "--json", ] + if only_latest: - cmd.extend(['--latest', '1']) + cmd.extend(["--latest", "1"]) result = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) if result.returncode != 0: - raise Exception("Error executing restic snapshot command. 
" + self.parse_stderr(result)) - snapshots = json.loads(result.stdout.decode('utf-8')) + raise Exception( + "Error executing restic snapshot command: " + self.parse_stderr(result) + ) + snapshots = json.loads(result.stdout.decode("utf-8")) for snap in snapshots: - snap['hash'] = self.calc_snapshot_hash(snap) + snap["hash"] = self.calc_snapshot_hash(snap) return snapshots def get_stats(self, snapshot_id=None): @@ -163,19 +231,24 @@ class ResticCollector(object): return self.stats_cache[snapshot_id] cmd = [ - 'restic', - '-r', self.repository, - '-p', self.password_file, - '--no-lock', - 'stats', '--json' + "restic", + "-r", + self.repository, + "-p", + self.password_file, + "--no-lock", + "stats", + "--json", ] if snapshot_id is not None: cmd.extend([snapshot_id]) result = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) if result.returncode != 0: - raise Exception("Error executing restic stats command. " + self.parse_stderr(result)) - stats = json.loads(result.stdout.decode('utf-8')) + raise Exception( + "Error executing restic stats command: " + self.parse_stderr(result) + ) + stats = json.loads(result.stdout.decode("utf-8")) if snapshot_id is not None: self.stats_cache[snapshot_id] = stats @@ -185,62 +258,86 @@ class ResticCollector(object): def get_check(self): # This command takes 20 seconds or more, but it's required cmd = [ - 'restic', - '-r', self.repository, - '-p', self.password_file, - '--no-lock', - 'check' + "restic", + "-r", + self.repository, + "-p", + self.password_file, + "--no-lock", + "check", ] result = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) if result.returncode == 0: return 1 # ok - logging.warning("Error checking the repository health. " + self.parse_stderr(result)) - return 0 # error + else: + logging.warning( + "Error checking the repository health. " + self.parse_stderr(result) + ) + return 0 # error def calc_snapshot_hash(self, snapshot: dict) -> str: - text = snapshot['hostname'] + ",".join(snapshot['paths']) - return hashlib.sha256(text.encode('utf-8')).hexdigest() + text = snapshot["hostname"] + ",".join(snapshot["paths"]) + return hashlib.sha256(text.encode("utf-8")).hexdigest() def parse_stderr(self, result): - return result.stderr.decode('utf-8').replace("\n", " ") + " Exit code: " + str(result.returncode) + return ( + result.stderr.decode("utf-8").replace("\n", " ") + + " Exit code: " + + str(result.returncode) + ) if __name__ == "__main__": logging.basicConfig( - format='%(asctime)s %(levelname)-8s %(message)s', + format="%(asctime)s %(levelname)-8s %(message)s", level=logging.getLevelName(os.environ.get("LOG_LEVEL", "INFO")), - datefmt='%Y-%m-%d %H:%M:%S', - handlers=[ - logging.StreamHandler(sys.stdout) - ] + datefmt="%Y-%m-%d %H:%M:%S", + handlers=[logging.StreamHandler(sys.stdout)], ) - logging.info("Starting Restic Prometheus Exporter ...") - logging.info("It could take a while if the repository is remote.") + logging.info("Starting Restic Prometheus Exporter") + logging.info("It could take a while if the repository is remote") try: restic_repo_url = os.environ["RESTIC_REPO_URL"] except Exception: - logging.error("Configuration error. The environment variable RESTIC_REPO_URL is mandatory") + logging.error("The environment variable RESTIC_REPO_URL is mandatory") sys.exit(1) try: restic_repo_password_file = os.environ["RESTIC_REPO_PASSWORD_FILE"] except Exception: - logging.error("Configuration error. 
The environment variable RESTIC_REPO_PASSWORD_FILE is mandatory") + logging.error("The environment variable RESTIC_REPO_PASSWORD_FILE is mandatory") sys.exit(1) exporter_address = os.environ.get("LISTEN_ADDRESS", "0.0.0.0") exporter_port = int(os.environ.get("LISTEN_PORT", 8001)) exporter_refresh_interval = int(os.environ.get("REFRESH_INTERVAL", 60)) + exporter_exit_on_error = bool(os.environ.get("EXIT_ON_ERROR", False)) + exporter_disable_check = bool(os.environ.get("NO_CHECK", False)) + exporter_disable_stats = bool(os.environ.get("NO_STATS", False)) - collector = ResticCollector(restic_repo_url, restic_repo_password_file) + try: + collector = ResticCollector( + restic_repo_url, + restic_repo_password_file, + exporter_exit_on_error, + exporter_disable_check, + exporter_disable_stats, + ) + REGISTRY.register(collector) + start_http_server(exporter_port, exporter_address) + logging.info( + "Serving at http://{0}:{1}".format(exporter_address, exporter_port) + ) - prometheus_client.core.REGISTRY.register(collector) - prometheus_client.start_http_server(exporter_port, exporter_address) + while True: + logging.info( + "Refreshing stats every {0} seconds".format(exporter_refresh_interval) + ) + time.sleep(exporter_refresh_interval) + collector.refresh() - logging.info("Server listening in http://%s:%d/metrics", exporter_address, exporter_port) - while True: - logging.info("Refreshing stats every %d seconds", exporter_refresh_interval) - time.sleep(exporter_refresh_interval) - collector.refresh() + except KeyboardInterrupt: + logging.info("\nInterrupted") + exit(0)