geostat/geoparser.py

128 lines
5.1 KiB
Python
Raw Normal View History

2018-10-08 22:25:40 +02:00
# Get GEO information for the client IPs found in the Nginx access.log.
2018-10-08 17:13:00 +02:00
# Alexey Nizhegolenko 2018
# Parts added by Remko Lodder, 2019.
# Added: IPv6 matching; lookups now use geoip2 instead of the legacy
# geoip module, which is expected to be retired soon.
2018-10-08 17:13:00 +02:00
import os
import re
2018-10-08 22:25:40 +02:00
import sys
2018-10-08 17:13:00 +02:00
import time
import geohash
import logging
import logging.handlers
import geoip2.database
2018-10-08 17:13:00 +02:00
import configparser
2018-10-08 18:00:33 +02:00
from influxdb import InfluxDBClient
from IPy import IP as ipadd
2018-10-08 17:13:00 +02:00
class SyslogBOMFormatter(logging.Formatter):
    """Formatter that prefixes every formatted record with a Unicode BOM.

    The prefix is the single code point U+FEFF, so that the text handed to
    the syslog handler starts with a byte-order mark once it is encoded.
    """

    def format(self, record):
        """Return the base-class formatting of *record* with U+FEFF prepended."""
        result = super().format(record)
        # "\ufeff" is the BOM character itself; without the backslash the
        # five literal characters "ufeff" were glued onto every message.
        return "\ufeff" + result
# Module-level logging setup: records from this module's logger are sent to
# the local syslog socket, formatted with logging.BASIC_FORMAT and the BOM
# prefix added by SyslogBOMFormatter.
formatter = SyslogBOMFormatter(logging.BASIC_FORMAT)
handler = logging.handlers.SysLogHandler(address='/dev/log')
handler.setFormatter(formatter)

# NOTE: despite its name, ``root`` is the logger named after this module
# (``__name__``), not the logging package's root logger.
root = logging.getLogger(__name__)
root.addHandler(handler)
# The threshold comes from the LOGLEVEL environment variable, default "INFO".
root.setLevel(os.environ.get("LOGLEVEL", "INFO"))
# Address patterns; they are applied with ``match`` and therefore only look
# at the very beginning of a log line.
_RE_IPV4 = re.compile(r'(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})')
_RE_IPV6 = re.compile(r'(([0-9a-fA-F]{1,4}:){7,7}[0-9a-fA-F]{1,4}|([0-9a-fA-F]{1,4}:){1,7}:|([0-9a-fA-F]{1,4}:){1,6}:[0-9a-fA-F]{1,4}|([0-9a-fA-F]{1,4}:){1,5}(:[0-9a-fA-F]{1,4}){1,2}|([0-9a-fA-F]{1,4}:){1,4}(:[0-9a-fA-F]{1,4}){1,3}|([0-9a-fA-F]{1,4}:){1,3}(:[0-9a-fA-F]{1,4}){1,4}|([0-9a-fA-F]{1,4}:){1,2}(:[0-9a-fA-F]{1,4}){1,5}|[0-9a-fA-F]{1,4}:((:[0-9a-fA-F]{1,4}){1,6})|:((:[0-9a-fA-F]{1,4}){1,7}|:)|fe80:(:[0-9a-fA-F]{0,4}){0,4}%[0-9a-zA-Z]{1,}|::(ffff(:0{1,4}){0,1}:){0,1}((25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}(25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])|([0-9a-fA-F]{1,4}:){1,4}:((25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}(25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9]))') # NOQA


def _extract_ip(line):
    """Return the address found at the start of *line*, or None if there is none."""
    # NOTE(review): the IPv6 pattern is an ordered, unanchored alternation, so
    # a compressed address such as "2001:db8::1" appears to match only up to
    # "2001:db8::" - confirm against real log lines.
    found = _RE_IPV4.match(line) or _RE_IPV6.match(line)
    return found.group(1) if found else None


def _is_public(ip):
    """Return True when IPy reports *ip* as 'PUBLIC'; False if IPy rejects it."""
    # NOTE(review): IPy may label IPv6 space by allocation rather than as
    # 'PUBLIC' - verify that IPv6 clients actually pass this filter.
    try:
        return ipadd(ip).iptype() == 'PUBLIC'
    except ValueError:
        # The IPv4 pattern accepts octets above 255, which IPy refuses.
        return False


def _log_rotated(path, inode):
    """Return True when *path* is gone or no longer has inode number *inode*."""
    try:
        return os.stat(path).st_ino != inode
    except OSError:
        # The file can be briefly absent while it is being rotated.
        return True


def _build_point(info, hostname, measurement):
    """Build one InfluxDB point from a GeoIP city record, or None without coordinates."""
    location = info.location
    if location.latitude is None or location.longitude is None:
        return None
    # A fresh dict per point, so nothing is shared between log lines.
    return {
        'measurement': measurement,
        'tags': {
            'geohash': geohash.encode(location.latitude, location.longitude),
            'host': hostname,
            'country_code': info.country.iso_code,
            'country_name': info.country.name,
            'city_name': info.city.name,
        },
        'fields': {'count': 1},
    }


def logparse(LOGPATH, INFLUXHOST, INFLUXPORT, INFLUXDBDB, INFLUXUSER, INFLUXUSERPASS, MEASUREMENT, GEOIPDB, INODE): # NOQA
    """Follow LOGPATH from its current end and send one point per public client IP.

    Each new line is checked for a leading IPv4/IPv6 address. Addresses that
    IPy classifies as 'PUBLIC' are looked up in the GeoIP database and written
    to InfluxDB as a point with geohash/host/country/city tags and a
    ``count`` field of 1.

    Parameters:
        LOGPATH: path of the access log to follow.
        INFLUXHOST, INFLUXPORT, INFLUXDBDB, INFLUXUSER, INFLUXUSERPASS:
            connection settings passed to ``InfluxDBClient``.
        MEASUREMENT: measurement name stored in every point.
        GEOIPDB: path of the database opened with ``geoip2.database.Reader``.
        INODE: inode number of LOGPATH as seen by the caller; the function
            returns once LOGPATH no longer has that inode.

    Returns:
        None. Returns when the log file has been rotated or removed.

    Write failures are logged on this module's logger and do not stop the
    loop. Lines without an address, non-public or invalid addresses, and
    addresses unknown to the GeoIP database are skipped.
    """
    # Local import: only this function needs the exception type.
    from geoip2.errors import AddressNotFoundError

    log = logging.getLogger(__name__)

    # Preparing variables and params
    HOSTNAME = os.uname()[1]
    CLIENT = InfluxDBClient(host=INFLUXHOST, port=INFLUXPORT,
                            username=INFLUXUSER, password=INFLUXUSERPASS,
                            database=INFLUXDBDB)
    GI = geoip2.database.Reader(GEOIPDB)

    try:
        # Main loop to parse access.log file in tailf style with sending metrics
        with open(LOGPATH, "r") as FILE:
            # Only lines appended from now on are processed.
            FILE.seek(0, os.SEEK_END)
            while True:
                WHERE = FILE.tell()
                LINE = FILE.readline()

                if not LINE:
                    # Rotation is checked only once the open file is drained,
                    # so lines that were already written are not dropped.
                    if _log_rotated(LOGPATH, INODE):
                        break
                    time.sleep(1)
                    # Re-seek so the next readline() retries from this spot.
                    FILE.seek(WHERE)
                    continue

                IP = _extract_ip(LINE)
                if IP is None or not _is_public(IP):
                    continue

                try:
                    INFO = GI.city(IP)
                except (AddressNotFoundError, ValueError):
                    # Unknown or unparsable address: nothing to report.
                    continue

                POINT = _build_point(INFO, HOSTNAME, MEASUREMENT)
                if POINT is None:
                    continue

                # Sending json data to InfluxDB
                try:
                    CLIENT.write_points([POINT])
                except Exception:
                    log.exception("Cannot establish connection with InfluxDB server: ") # NOQA
    finally:
        # This function is re-entered after every rotation; release the
        # database reader instead of leaking one per call.
        GI.close()
2018-10-08 17:13:00 +02:00
2018-10-08 18:00:33 +02:00
2018-10-08 22:25:40 +02:00
def main():
    """Read settings.ini next to this file and follow the Nginx log forever.

    The GEOIP, NGINX_LOG and INFLUXDB sections of the config supply the
    arguments for ``logparse``. Whenever ``logparse`` returns, the inode of
    the log file is read again and parsing restarts. While the log file
    cannot be stat'ed, a message is logged and printed once per second.

    Raises:
        configparser.Error: when a required section or option is missing.
    """
    log = logging.getLogger(__name__)

    # Preparing for reading config file
    PWD = os.path.abspath(os.path.dirname(os.path.realpath(__file__)))
    CONFIG = configparser.ConfigParser()
    CONFIG.read('%s/settings.ini' % PWD)

    # Getting params from config
    GEOIPDB = CONFIG.get('GEOIP', 'geoipdb')
    LOGPATH = CONFIG.get('NGINX_LOG', 'logpath')
    INFLUXHOST = CONFIG.get('INFLUXDB', 'host')
    INFLUXPORT = CONFIG.get('INFLUXDB', 'port')
    INFLUXDBDB = CONFIG.get('INFLUXDB', 'database')
    INFLUXUSER = CONFIG.get('INFLUXDB', 'username')
    MEASUREMENT = CONFIG.get('INFLUXDB', 'measurement')
    INFLUXUSERPASS = CONFIG.get('INFLUXDB', 'password')

    # Parsing log file and sending metrics to Influxdb
    while True:
        # Get inode from log file. Stat first and handle the failure: the
        # previous stat-then-exists order raised before the check could run.
        try:
            INODE = os.stat(LOGPATH).st_ino
        except OSError:
            log.info('Nginx log file %s not found', LOGPATH)
            print('Nginx log file %s not found' % LOGPATH)
            # Wait before retrying so a missing file does not spin the CPU.
            time.sleep(1)
            continue

        # Run main loop and grep a log file
        logparse(LOGPATH, INFLUXHOST, INFLUXPORT, INFLUXDBDB, INFLUXUSER, INFLUXUSERPASS, MEASUREMENT, GEOIPDB, INODE) # NOQA
2018-10-08 17:13:00 +02:00
if __name__ == '__main__':
    # Script entry point: run main() and turn its outcome into an exit code.
    # Messages go to this module's logger rather than the logging root.
    _log = logging.getLogger(__name__)
    try:
        main()
    except KeyboardInterrupt:
        _log.exception("Exception KeyboardInterrupt: ")
        sys.exit(0)
    except Exception:
        _log.exception("Exception in main(): ")
        # Exit non-zero so a supervisor can tell a crash from a clean stop.
        sys.exit(1)