#!/usr/bin/env python3
# Preprocesses geolite ASN database for faster queries
# Copyright (C) 2019 Libor Polčák <ipolcak@fit.vutbr.cz>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

import argparse
import os
import os.path
import sys

from time_parser import FormatTimeWrapper

# Header rows (column names) written as the first line of every generated
# file. The ASN and City headers are passed to write_network() for the
# "networks" files; the location header is used by write_location() for the
# "locations" files.
ASN_FIRST_LINE_CSV = "network,autonomous_system_number,autonomous_system_organization\n"
CITY_FIRST_LINE_CSV = "network,geoname_id,registered_country_geoname_id,represented_country_geoname_id,is_anonymous_proxy,is_satellite_provider,postal_code,latitude,longitude,accuracy_radius\n"
LOCATION_FIRST_LINE_CSV = "geoname_id,locale_code,continent_code,continent_name,country_iso_code,country_name,subdivision_1_iso_code,subdivision_1_name,subdivision_2_iso_code,subdivision_2_name,city_name,metro_code,time_zone\n"

def write_network(basedir, line, first_line_csv):
    """Append one CSV record to the ``networks`` file inside *basedir*.

    If the file does not exist yet, *basedir* is created (including missing
    parents) and *first_line_csv* is written as the header before the record.

    Args:
        basedir: Directory that holds (or will hold) the ``networks`` file.
        line: The CSV record to store, including its trailing newline.
        first_line_csv: Header row written once when the file is created.
    """
    path = "%s/networks" % (basedir, )
    is_new = not os.path.exists(path)
    if is_new:
        os.makedirs(basedir, exist_ok=True)
    # The context manager closes (and therefore flushes) the handle on every
    # call instead of leaving that to the garbage collector.
    with open(path, "w" if is_new else "a", encoding="utf-8") as outfile:
        if is_new:
            outfile.write(first_line_csv)
        outfile.write(line)

def write_location(basedir, line):
    """Append one CSV record to the ``locations`` file inside *basedir*.

    If the file does not exist yet, *basedir* is created (including missing
    parents) and LOCATION_FIRST_LINE_CSV is written as the header before the
    record.

    Args:
        basedir: Directory that holds (or will hold) the ``locations`` file.
        line: The CSV record to store, including its trailing newline.
    """
    path = "%s/locations" % (basedir, )
    is_new = not os.path.exists(path)
    if is_new:
        os.makedirs(basedir, exist_ok=True)
    # The context manager closes (and therefore flushes) the handle on every
    # call instead of leaving that to the garbage collector.
    with open(path, "w" if is_new else "a", encoding="utf-8") as outfile:
        if is_new:
            outfile.write(LOCATION_FIRST_LINE_CSV)
        outfile.write(line)

def process_file(fname, outdir, stdout, ipaddr_sep, plen_group_len, first_line_csv):
    """Distribute the records of a network-blocks CSV into a directory tree.

    Each record is stored via write_network() in a directory chosen from the
    prefix length of its network (the first CSV column, ``address/plen``):

    * plen below one group length: directly in *outdir*,
    * plen below two group lengths: in ``outdir/<group 0>``,
    * otherwise: in ``outdir/<group 0>/<group 1>``.

    Args:
        fname: Path of the input CSV file; its first line is skipped.
        outdir: Root directory of the generated tree.
        stdout: Not used by this function.
        ipaddr_sep: Separator between the groups of an address.
        plen_group_len: Number of prefix bits covered by one address group.
        first_line_csv: Header row handed over to write_network().
    """
    two_groups_plen = plen_group_len * 2
    with open(fname, encoding="utf-8") as blocks:
        next(blocks)  # Skip the initial line (column names)
        for record in blocks:
            network = record.partition(",")[0]
            address, _, prefix = network.partition("/")
            prefix_len = int(prefix)
            parts = address.split(ipaddr_sep)
            # Pick the nesting depth first, then store the record once.
            if prefix_len >= two_groups_plen:
                target_dir = "%s/%s/%s" % (outdir, parts[0], parts[1])
            elif prefix_len >= plen_group_len:
                target_dir = "%s/%s" % (outdir, parts[0])
            else:
                target_dir = "%s" % (outdir, )
            write_network(target_dir, record, first_line_csv)

def process_file4(fname, outdir, stdout, first_line_csv):
    """Preprocess an IPv4 blocks CSV into the ``ipv4`` subtree of *outdir*.

    Delegates to process_file() with "." as the group separator and 8 prefix
    bits per address group.
    """
    ipv4_outdir = "{}/ipv4".format(outdir)
    process_file(fname, ipv4_outdir, stdout,
                 ipaddr_sep=".", plen_group_len=8, first_line_csv=first_line_csv)

def process_file6(fname, outdir, stdout, first_line_csv):
    """Preprocess an IPv6 blocks CSV into the ``ipv6`` subtree of *outdir*.

    Delegates to process_file() with ":" as the group separator and 16 prefix
    bits per address group.
    """
    ipv6_outdir = "{}/ipv6".format(outdir)
    process_file(fname, ipv6_outdir, stdout,
                 ipaddr_sep=":", plen_group_len=16, first_line_csv=first_line_csv)

def process_file_loc(fname, outdir, stdout):
    """Distribute the records of a locations CSV into a directory tree.

    The target directory of a record is derived from its first CSV column
    (the geoname id): ``outdir/locations/<id length>`` followed by one
    sub-directory per leading two-character pair of the id, for as long as
    more than three characters remain. For example, the id ``1234567`` is
    stored under ``outdir/locations/7/12/34``.

    Args:
        fname: Path of the input CSV file; its first line is skipped.
        outdir: Root directory of the generated tree.
        stdout: Not used by this function.
    """
    locations_root = "%s/locations" % outdir
    with open(fname, encoding="utf-8") as rows:
        next(rows)  # Skip the initial line (column names)
        for row in rows:
            ident = row.partition(",")[0]
            ident_len = len(ident)
            segments = [locations_root, "%d" % ident_len]
            # Starting offsets 0, 2, 4, ... while more than 3 characters remain.
            segments.extend(ident[start:start + 2]
                            for start in range(0, ident_len - 3, 2))
            write_location("/".join(segments), row)

# Argument handling
def process_args():
    """Parse the command line and return the resulting argparse namespace.

    Both ``--input``/``-i`` and ``--output``/``-o`` are mandatory.
    """
    required_options = (
        (("--input", "-i"),
         "The input directory with the expected name of GeoLite2-X-CSV_YYYYMMDD where X is ASN or City."),
        (("--output", "-o"),
         "The output directory where the preprocessed files are going to be stored."),
    )
    cli = argparse.ArgumentParser(description="TARZAN GeoLite ASN database preprocessor")
    for flags, help_text in required_options:
        cli.add_argument(*flags, required=True, help=help_text)
    return cli.parse_args()


def main(args, stdout):
    """Preprocess a GeoLite2 CSV directory into the output directory tree.

    The output is stored in ``<args.output>/<name of the input directory>``.
    Depending on whether the input path contains "ASN" or "City", the
    matching IPv4/IPv6 block files (and, for City, the English locations
    file) are processed. Sub-trees that already exist are left untouched, so
    a partially generated structure is completed on a later run.

    Args:
        args: Namespace with the ``input`` and ``output`` attributes.
        stdout: Stream passed through to the processing functions.

    Exits with status 1 when the database name cannot be derived from the
    input path or when the complete output structure already exists.
    """
    # The database name is the last non-empty component of the input path;
    # empty components produced by trailing separators are skipped.
    geoname = ""
    path = args.input
    while geoname == "":
        parent, geoname = os.path.split(path)
        if parent == path:
            # Nothing left to strip (empty path or filesystem root); without
            # this check the loop would never terminate.
            break
        path = parent
    if geoname == "":
        sys.stderr.write("Cannot derive the database name from the input path %r, aborting.\n"
                % (args.input, ))
        sys.exit(1)
    outdir = args.output + "/%s" % geoname
    try:
        os.makedirs(outdir, exist_ok = False)
    except FileExistsError:
        # Check that the directory structure is OK
        subdirs = ["ipv4", "ipv6"]
        if "City" in args.input:
            subdirs.append("locations")
        if all(os.path.exists("%s/%s" % (outdir, subdir)) for subdir in subdirs):
            sys.stderr.write("Directory structure already exists, aborting.\n")
            sys.exit(1)
    if "ASN" in args.input:
        if not os.path.exists("%s/%s" % (outdir, "ipv4")):
            process_file4(args.input + "/GeoLite2-ASN-Blocks-IPv4.csv", outdir, stdout,
                ASN_FIRST_LINE_CSV)
        if not os.path.exists("%s/%s" % (outdir, "ipv6")):
            process_file6(args.input + "/GeoLite2-ASN-Blocks-IPv6.csv", outdir, stdout,
                ASN_FIRST_LINE_CSV)
    elif "City" in args.input:
        if not os.path.exists("%s/%s" % (outdir, "ipv4")):
            process_file4(args.input + "/GeoLite2-City-Blocks-IPv4.csv", outdir, stdout,
                CITY_FIRST_LINE_CSV)
        if not os.path.exists("%s/%s" % (outdir, "ipv6")):
            process_file6(args.input + "/GeoLite2-City-Blocks-IPv6.csv", outdir, stdout,
                CITY_FIRST_LINE_CSV)
        if not os.path.exists("%s/%s" % (outdir, "locations")):
            process_file_loc(args.input + "/GeoLite2-City-Locations-en.csv", outdir, stdout)

# Script entry point: parse the command line and run the preprocessing.
if __name__ == "__main__":
    main(process_args(), sys.stdout)
