Improve error handling and log traces
This commit is contained in:
parent
84190584b8
commit
864f63600a
1 changed file with 16 additions and 9 deletions
|
@ -8,6 +8,7 @@ import time
|
||||||
import re
|
import re
|
||||||
import subprocess
|
import subprocess
|
||||||
import sys
|
import sys
|
||||||
|
import traceback
|
||||||
|
|
||||||
import prometheus_client
|
import prometheus_client
|
||||||
import prometheus_client.core
|
import prometheus_client.core
|
||||||
|
@ -22,7 +23,7 @@ class ResticCollector(object):
|
||||||
# todo: cold start -> the restic cache (/root/.cache/restic) could be saved in a persistent volume
|
# todo: cold start -> the restic cache (/root/.cache/restic) could be saved in a persistent volume
|
||||||
self.stats_cache = {}
|
self.stats_cache = {}
|
||||||
self.metrics = {}
|
self.metrics = {}
|
||||||
self.refresh()
|
self.refresh(True)
|
||||||
|
|
||||||
def collect(self):
|
def collect(self):
|
||||||
logging.debug("Incoming request")
|
logging.debug("Incoming request")
|
||||||
|
@ -84,11 +85,13 @@ class ResticCollector(object):
|
||||||
yield backup_size_total
|
yield backup_size_total
|
||||||
yield backup_snapshots_total
|
yield backup_snapshots_total
|
||||||
|
|
||||||
def refresh(self):
|
def refresh(self, exit_on_error=False):
|
||||||
try:
|
try:
|
||||||
self.metrics = self.get_metrics()
|
self.metrics = self.get_metrics()
|
||||||
except Exception as e:
|
except Exception:
|
||||||
logging.error("Unable to collect metrics from Restic. Error: %s", str(e))
|
logging.error("Unable to collect metrics from Restic. %s", traceback.format_exc(0).replace("\n", " "))
|
||||||
|
if exit_on_error:
|
||||||
|
sys.exit(1)
|
||||||
|
|
||||||
def get_metrics(self):
|
def get_metrics(self):
|
||||||
all_snapshots = self.get_snapshots()
|
all_snapshots = self.get_snapshots()
|
||||||
|
@ -144,9 +147,9 @@ class ResticCollector(object):
|
||||||
if only_latest:
|
if only_latest:
|
||||||
cmd.extend(['--latest', '1'])
|
cmd.extend(['--latest', '1'])
|
||||||
|
|
||||||
result = subprocess.run(cmd, stdout=subprocess.PIPE)
|
result = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
|
||||||
if result.returncode != 0:
|
if result.returncode != 0:
|
||||||
raise Exception("Error executing restic snapshot command. Exit code: " + str(result.returncode))
|
raise Exception("Error executing restic snapshot command. " + self.parse_stderr(result))
|
||||||
snapshots = json.loads(result.stdout.decode('utf-8'))
|
snapshots = json.loads(result.stdout.decode('utf-8'))
|
||||||
for snap in snapshots:
|
for snap in snapshots:
|
||||||
snap['hash'] = self.calc_snapshot_hash(snap)
|
snap['hash'] = self.calc_snapshot_hash(snap)
|
||||||
|
@ -169,9 +172,9 @@ class ResticCollector(object):
|
||||||
if snapshot_id is not None:
|
if snapshot_id is not None:
|
||||||
cmd.extend([snapshot_id])
|
cmd.extend([snapshot_id])
|
||||||
|
|
||||||
result = subprocess.run(cmd, stdout=subprocess.PIPE)
|
result = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
|
||||||
if result.returncode != 0:
|
if result.returncode != 0:
|
||||||
raise Exception("Error executing restic stats command. Exit code: " + str(result.returncode))
|
raise Exception("Error executing restic stats command. " + self.parse_stderr(result))
|
||||||
stats = json.loads(result.stdout.decode('utf-8'))
|
stats = json.loads(result.stdout.decode('utf-8'))
|
||||||
|
|
||||||
if snapshot_id is not None:
|
if snapshot_id is not None:
|
||||||
|
@ -189,15 +192,19 @@ class ResticCollector(object):
|
||||||
'check'
|
'check'
|
||||||
]
|
]
|
||||||
|
|
||||||
result = subprocess.run(cmd, stdout=subprocess.PIPE)
|
result = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
|
||||||
if result.returncode == 0:
|
if result.returncode == 0:
|
||||||
return 1 # ok
|
return 1 # ok
|
||||||
|
logging.warning("Error checking the repository health. " + self.parse_stderr(result))
|
||||||
return 0 # error
|
return 0 # error
|
||||||
|
|
||||||
def calc_snapshot_hash(self, snapshot: dict) -> str:
|
def calc_snapshot_hash(self, snapshot: dict) -> str:
|
||||||
text = snapshot['hostname'] + ",".join(snapshot['paths'])
|
text = snapshot['hostname'] + ",".join(snapshot['paths'])
|
||||||
return hashlib.sha256(text.encode('utf-8')).hexdigest()
|
return hashlib.sha256(text.encode('utf-8')).hexdigest()
|
||||||
|
|
||||||
|
def parse_stderr(self, result):
|
||||||
|
return result.stderr.decode('utf-8').replace("\n", " ") + " Exit code: " + str(result.returncode)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
logging.basicConfig(
|
logging.basicConfig(
|
||||||
|
|
Loading…
Reference in a new issue