"""
 Tool Name:  NameCheck
 Description: This tool identifies incorrectly named watersheds.
 Author: Patrick Longley (plongley@usgs.gov)
 Created: 08/27/2020
 Language: Written in python3 (arcpro).  Modified to work in arcmap (python2)
    Proofread 09/08/2020.
    metadata 20201005,
    fields as constants 20201005,
    check fields in update parameters 20201005
 TODO: Check frontal feature is a gnis name (doesn't have to be in watershed)
 TODO: need to check fields in NHD during validation (fields within a job are a different schema)
 TODO: Flags duplicates >>> false flags if running on > HUC8 scale
 TODO: Recommends bad names if run on < HUC8 scale (duplicates names in HUC8)
 TODO: deal with "None" in potential_names field (None in pandas becomes "None" as a string in arc)
"""

import sys
import os
import arcpy
import numpy as np
import re
import wbd_f
import wbd_params
import wbd_c

# Constants
# Major version of the running interpreter; used to choose version-specific code paths.
PYTHON_VERSION = sys.version_info.major
# Workspace path returned by wbd_f.get_memoryfpath for this Python version;
# intermediate outputs (intersect results, output table) are written under it.
MEMORY_FPATH = wbd_f.get_memoryfpath(PYTHON_VERSION)
# Name of the flag field written by this check: the name field plus the flag suffix.
NAME_FLAG = wbd_c.F_NAME + wbd_c.FLAG_SUFFIX
# Regex matching an empty or whitespace-only string.
WHITESPACE = r'^\s*$'

class NameCheck(object):
    """
    Checks WBD names and creates an ESRI table with the results of the checks.

    Args:
        polygonfc_updated (polygon feat class): Polygon feature class representing the updated WBD polygons.
            Must contain a HUC code field.
            Must contain a name field.
            Optional hutype field.
        gnis_fc (point feature class):  Point feature containing GNIS data.
        nhdwaterbody_fc (polygon feature class): Polygon feature class containing NHD waterbodies.
            Must contain a gnis_name field.
        nhdline_fc (line feature class): Line feature class containing NHD flowlines.
            Must contain a gnis_name field.

    Outputs:
        returns: None
        output: the results are built as a temporary ESRI table (out_table), joined onto the
            polygon feature class by HUC code, and the temporary table is then deleted.

    """

    def __init__(self):
        """
        Set the tool metadata attributes.

        callfrom_pyt tells execute() whether params are toolbox parameter objects
        (True) or plain values passed from a script (False).
        """
        self.category = 'Attribution'
        self.label = "5) Polygon: Name Check"
        self.description = "Checks WBD watershed names and creates an ESRI table with the results of the checks."
        self.callfrom_pyt = True

    def getParameterInfo(self):
        """
        Return the tool parameters, in the order execute() reads them:
        updated polygons, GNIS points, NHD waterbodies, NHD flowlines, add-update-columns option.
        """
        parameters = [
            wbd_params.polygonfc_updated,
            wbd_params.gnis_fc,
            wbd_params.nhdwaterbody_fc,
            wbd_params.nhdline_fc,
            wbd_params.add_updatecolumns,
        ]
        return parameters

    def updateMessages(self, params):
        """
        Add field-requirement errors to the tool parameters.

        This method is called after internal validation. For each input that has been
        altered, the required fields are looked up and an error message is set on the
        parameter when they are missing.
        """
        template = "Feature class must contain the following fields: {}."
        # WBD polygons: name and hutype fields plus a HUC code field
        wbd_param = params[0]
        if wbd_param.altered:
            required = [wbd_c.F_NAME, wbd_c.F_HUTYPE]
            fc_path = wbd_param.valueAsText
            fields_ok = wbd_f.check_fieldsexist(fc_path, required) and wbd_f.get_hucfield(fc_path)
            if not fields_ok:
                wbd_param.setErrorMessage(template.format(', '.join(required + ['huc'])))
        # GNIS points, NHD waterbodies and NHD flowlines: one required name field each
        single_field_checks = (
            (1, wbd_c.F_FEATURE_NAME),
            (2, wbd_c.F_GNIS_NAME),
            (3, wbd_c.F_GNIS_NAME),
        )
        for index, field in single_field_checks:
            param = params[index]
            if param.altered and not wbd_f.check_fieldsexist(param.valueAsText, [field]):
                param.setErrorMessage(template.format(field))

    def create_fls(self):
        """
        Build the feature layers used by the later joins.

        The NHD flowline and waterbody layers keep only features whose GNIS name is not null.
        The polygon layer keeps only features whose HUC code, hutype and name are neither
        an empty string nor null.
        """
        named_only = """{} IS NOT NULL"""
        flowline_sql = named_only.format(arcpy.AddFieldDelimiters(self.nhdline_fc, wbd_c.F_GNIS_NAME))
        waterbody_sql = named_only.format(arcpy.AddFieldDelimiters(self.nhdwaterbody_fc, wbd_c.F_GNIS_NAME))
        # delimited polygon field names, in the order the query uses them (used twice)
        delimited = [
            arcpy.AddFieldDelimiters(self.polygonfc_updated, field)
            for field in (self.hucfield, wbd_c.F_HUTYPE, wbd_c.F_NAME)
        ]
        polygon_sql = """{} <> '' AND {} <> ''  AND {} <> ''  AND {} IS NOT NULL AND {} IS NOT NULL AND {} IS NOT NULL""".format(
            *(delimited + delimited)
        )
        with arcpy.EnvManager(overwriteOutput=True):
            self.nhdline_fl = arcpy.MakeFeatureLayer_management(
                self.nhdline_fc, 'flowline_fl', where_clause=flowline_sql)
            self.nhdwaterbody_fl = arcpy.MakeFeatureLayer_management(
                self.nhdwaterbody_fc, 'waterbody_fl', where_clause=waterbody_sql)
            self.polygon_fl = arcpy.MakeFeatureLayer_management(
                self.polygonfc_updated, 'polygon_fl', where_clause=polygon_sql)

    def join_intersect(self):
        """
        Collect the candidate names found inside each WBD polygon.

        The polygon layer is intersected with the NHD flowline layer and with the NHD
        waterbody layer, and GNIS feature names are spatially joined onto the polygons.
        The three results are kept separately and trimmed to the fields in self.to_keep
        (which gains the GNIS name field here).
        """
        flowline_out = os.path.join(MEMORY_FPATH, 'sj1')
        waterbody_out = os.path.join(MEMORY_FPATH, 'sj2')
        gnis_out = os.path.join(MEMORY_FPATH, 'sj3')
        self.to_keep.append(wbd_c.F_GNIS_NAME)
        with arcpy.EnvManager(overwriteOutput=True):
            # polygons x NHD flowlines
            self.flowlineintersect_fl = arcpy.MakeFeatureLayer_management(
                arcpy.Intersect_analysis([self.polygon_fl, self.nhdline_fl], flowline_out),
                'flowlineintersect_fl')
            # polygons x NHD waterbodies
            self.waterbodyintersect_fl = arcpy.MakeFeatureLayer_management(
                arcpy.Intersect_analysis([self.polygon_fl, self.nhdwaterbody_fl], waterbody_out),
                'sjwb_fl')
            # GNIS feature names joined onto the polygons, then exported as a table
            self.sjgnis_fc = wbd_f.spatialjoin_singlefield(
                self.polygon_fl, self.gnis_fc, wbd_c.F_FEATURE_NAME, wbd_c.F_GNIS_NAME, gnis_out)
            self.gnis_table = arcpy.TableToTable_conversion(self.sjgnis_fc, arcpy.env.workspace, 'gnis_table')
        for dataset in (self.flowlineintersect_fl, self.waterbodyintersect_fl, self.gnis_table):
            wbd_f.delete_extrafields(dataset, self.to_keep)

    def join_names(self):
        """
        Append NHD-derived names that are valid GNIS names to the GNIS table.

        Each intersect layer is joined to the GNIS feature class on the name field with
        KEEP_COMMON, so only NHD names that also occur in GNIS survive. The joined rows
        are exported, trimmed to self.to_keep, appended to self.gnis_table, and the
        intermediate layers and tables are deleted.
        """
        sources = (
            (self.flowlineintersect_fl, 'tj1'),   # flowlines
            (self.waterbodyintersect_fl, 'tj2'),  # waterbodies
        )
        exported = []
        with arcpy.EnvManager(overwriteOutput=True):
            for layer, table_name in sources:
                joined = arcpy.AddJoin_management(
                    layer,
                    wbd_c.F_GNIS_NAME,
                    self.gnis_fc,
                    wbd_c.F_FEATURE_NAME,
                    "KEEP_COMMON"
                )
                exported.append(arcpy.TableToTable_conversion(joined, arcpy.env.workspace, table_name))
        for table in exported:
            wbd_f.delete_extrafields(table, self.to_keep)
        arcpy.Append_management(exported, self.gnis_table, 'NO_TEST')
        # clean up intermediates: both intersect layers first, then the exported tables
        for leftover in [layer for layer, _ in sources] + exported:
            arcpy.Delete_management(leftover)

    def create_df(self):
        """
        Load the GNIS table into a pandas df and prepare it for the checks.

        Reads the HUC code, name, GNIS name and flag columns (plus hutype when the polygon
        feature class has it) via wbd_f.create_df, deletes the source table, removes
        duplicate rows, derives the major/minor/frontal feature and prefix columns, then
        strips whitespace from text columns and replaces missing values with ''.
        """
        # create df
        cols = [self.hucfield, wbd_c.F_NAME, wbd_c.F_GNIS_NAME, NAME_FLAG]
        if wbd_c.F_HUTYPE in self.fields:
            cols = cols + [wbd_c.F_HUTYPE]
        self.df = wbd_f.create_df(self.gnis_table, cols)
        arcpy.Delete_management(self.gnis_table)
        # drop_duplicates returns a new frame, so the result has to be assigned back;
        # the index is rebuilt so it stays contiguous after rows are removed
        self.df = self.df.drop_duplicates().reset_index(drop=True)
        # split name into major feature, minor feature, and prefix
        self.get_majorfeature()
        self.get_prefixes()
        # strip white space and remove nans
        self.df = self.df.apply(lambda x: x.str.strip() if x.dtype == "object" else x)
        self.df = self.df.fillna('')

    def get_majorfeature(self):
        """
        Split the watershed name into major, minor and frontal feature columns.

        The minor feature is the non-digit text that precedes a hyphen which is not followed
        by "Frontal". The frontal feature is the text after "-Frontal". Both parts are
        removed from a copy of the name to leave the major feature.
        """
        # copy name into major feature column
        self.df[wbd_c.F_MAJORFEATURE] = self.df[wbd_c.F_NAME]
        # Major/ minor features
        if PYTHON_VERSION == 3:
            minorfeatures = re.compile(r'(\D+(?=-(?!Frontal)))')  # non-digit characters followed by a -, not followed by Frontal (don't capture -)
            frontalfeatures = re.compile(r'-Frontal\s*(.+)')
        else:
            minorfeatures = r'(\D+(?=-(?!Frontal)))'  # non-digit characters followed by a -, not followed by Frontal
            frontalfeatures = r'-Frontal\s*(.+)'
        self.df[wbd_c.F_MINORFEATURE] = self.df[wbd_c.F_NAME].str.extract(minorfeatures)
        self.df[wbd_c.F_FRONTALFEATURE] = self.df[wbd_c.F_NAME].str.extract(frontalfeatures).astype(object)
        # regex=True is passed explicitly: newer pandas treats the pattern as a literal string
        # by default and refuses compiled patterns in that mode
        self.df[wbd_c.F_MAJORFEATURE] = self.df[wbd_c.F_MAJORFEATURE].str.replace(minorfeatures, '', regex=True).str.lstrip('-')
        self.df[wbd_c.F_MAJORFEATURE] = self.df[wbd_c.F_MAJORFEATURE].str.replace(frontalfeatures, '', regex=True)

    def get_prefixes(self):
        """
        Move prefixes out of the major and minor features into their own column.

        The prefix is extracted from the name with wbd_f.prefix_regex and stored in the
        temporary prefix column. Prefixes (wbd_f.prefix_regex2) and digit runs followed by
        a hyphen are then removed from the major and minor feature columns.
        """
        # prefixes to new column
        self.df[wbd_c.F_PREFIXTEMP] = self.df[wbd_c.F_NAME].str.extract(wbd_f.prefix_regex)
        # remove prefixes from major and minor names
        # (regex=True is explicit: newer pandas defaults to literal matching)
        self.df[wbd_c.F_MAJORFEATURE] = self.df[wbd_c.F_MAJORFEATURE].str.replace(wbd_f.prefix_regex2, '', regex=True).astype(object)
        self.df[wbd_c.F_MINORFEATURE] = self.df[wbd_c.F_MINORFEATURE].str.replace(wbd_f.prefix_regex2, '', regex=True).astype(object)
        # remove huccodes from major and minor features
        self.df[wbd_c.F_MAJORFEATURE] = self.df[wbd_c.F_MAJORFEATURE].str.replace(r'\d+-', '', regex=True)
        self.df[wbd_c.F_MINORFEATURE] = self.df[wbd_c.F_MINORFEATURE].str.replace(r'\d+-', '', regex=True)

    def check_gnis(self):
        """
        Flag names whose major or minor feature is not among the HUC's potential names.

        All GNIS names found for a (HUC code, name) pair are concatenated with
        wbd_f.df_stringconcat into the potential-names column, which is merged back onto
        the df. A row keeps its flag when the major feature is found in the potential names
        (or is a HUC code of the expected number of digits) and the minor feature is found
        in the potential names; otherwise the flag is updated with the wrong flag.
        """
        # Group data frame by HUC and concatenate all potential names for each HUC.
        # The column is named with the same constant used to read it below.
        agg_df = (
            self.df.groupby([self.hucfield, wbd_c.F_NAME])[wbd_c.F_GNIS_NAME]
            .apply(wbd_f.df_stringconcat)
            .rename(wbd_c.F_POTENTIALNAMES)
            .reset_index()
        )
        self.df = self.df.merge(agg_df)
        # a HUC code of exactly self.hudigit digits; compiled once rather than per row
        huc_pattern = re.compile(r'^\d{}$'.format(''.join(['{', self.hudigit, '}'])))

        def check_namelist(row):
            """
            Checks major feature is a valid GNIS name or a valid (correct length) HUC code.
            Checks minor feature is a valid GNIS name.
            """
            potential = row[wbd_c.F_POTENTIALNAMES]
            # NOTE(review): `in` on the concatenated string is a substring test, so a feature
            # matches when it is part of a longer potential name, and an empty minor feature
            # always matches -- confirm whether whole-name matching is wanted.
            major_ok = (row[wbd_c.F_MAJORFEATURE] in potential or
                        huc_pattern.match(row[wbd_c.F_MAJORFEATURE]))
            if major_ok and row[wbd_c.F_MINORFEATURE] in potential:
                return row[NAME_FLAG]
            return wbd_f.update_flag(row[NAME_FLAG], wbd_c.WRONG_FLAG)

        self.df[NAME_FLAG] = self.df.apply(check_namelist, axis=1)

    def check_prefixes(self):
        """
        Flag names whose major feature is used with an unacceptable set of prefixes.

        The prefixes of all rows sharing a major feature are combined with
        wbd_f.df_stringconcat and merged back onto the df as the prefix column. Rows whose
        combined prefix string is not one of the accepted combinations get the wrong flag
        applied through wbd_f.update_flag.
        """
        # accepted combined-prefix strings ('-' appears for hyphenated naming structures)
        valid_combinations = {'',
                              'Headwaters', 'Outlet', 'Headwaters,Outlet',
                              '-', '-,Headwaters', '-,Outlet', '-,Headwaters,Outlet',
                              'Lower,Upper',
                              'Lower,Middle,Upper', 'Headwaters,Lower,Middle,Upper', 'Lower,Middle,Outlet,Upper', 'Headwaters,Lower,Middle,Outlet,Upper'}
        # one combined prefix string per major feature
        combined = self.df.groupby(wbd_c.F_MAJORFEATURE)[wbd_c.F_PREFIXTEMP].apply(wbd_f.df_stringconcat)
        combined = combined.rename(wbd_c.F_PREFIX).reset_index()
        self.df = self.df.merge(combined)

        def flag_for(row):
            # unchanged flag for an accepted combination, wrong flag otherwise
            if row[wbd_c.F_PREFIX] not in valid_combinations:
                return wbd_f.update_flag(row[NAME_FLAG], wbd_c.WRONG_FLAG)
            return row[NAME_FLAG]

        self.df[NAME_FLAG] = self.df.apply(flag_for, axis=1)

    def check_uniqueness(self):
        """
        Flag names that are used by more than one row.

        The GNIS name column is dropped and duplicate rows removed first, so each remaining
        row is distinct. Rows sharing a name with another row are set to the wrong flag,
        but only when their current flag is the correct or not-checked flag.
        """
        del self.df[wbd_c.F_GNIS_NAME]
        self.df = self.df.drop_duplicates()
        # True for every row whose name occurs more than once
        name_reused = self.df.duplicated(subset=wbd_c.F_NAME, keep=False)
        # only flags that carry no finding yet may be overwritten
        overridable = np.isin(self.df[NAME_FLAG], (wbd_c.CORRECT_FLAG, wbd_c.NOTCHECKED_FLAG))
        self.df.loc[name_reused & overridable, NAME_FLAG] = wbd_c.WRONG_FLAG

    def check_huccode(self, row):
        """
        Check the use of a HUC code (number) inside the name.

        Returns the row's current flag when the name contains no number. Otherwise returns
        the wrong flag when more than one potential name is available, when the major
        feature is the number although exactly one potential name is available, or when the
        number differs from the row's HUC code; in all other cases the current flag is kept.

        Args:
            row: df row with the name, potential names, major feature, HUC code and flag.

        Returns:
            The flag value for the row.
        """
        numbers = re.findall(r'(\d+)', row[wbd_c.F_NAME])
        if not numbers:
            return row[NAME_FLAG]
        # Count real names: split the comma-separated list and ignore empty entries and the
        # literal "None" (missing values turned into strings). Whole entries are compared so
        # that names merely containing "None" are still counted.
        entries = [entry.strip() for entry in row[wbd_c.F_POTENTIALNAMES].split(',')]
        n_names = sum(1 for entry in entries if entry and entry != 'None')
        # issue if multiple names are available, but huc code is used
        if n_names > 1:
            return wbd_c.WRONG_FLAG
        # issue if huccode is major feature, but a name is available
        if row[wbd_c.F_MAJORFEATURE] == numbers[0] and n_names == 1:
            return wbd_c.WRONG_FLAG
        # issue if number in name doesn't match huc-code
        if numbers[0] != row[self.hucfield]:
            return wbd_c.WRONG_FLAG
        return row[NAME_FLAG]

    def check_frontal(self, row):
        """
        Check that a frontal name and a frontal unit type (HUType = F) go together.

        Returns the wrong flag when exactly one of the two holds (frontal feature in the
        name without HUType F, or HUType F without a frontal feature); otherwise returns
        the row's current flag.
        """
        named_frontal = row[wbd_c.F_FRONTALFEATURE] != ''
        typed_frontal = row[wbd_c.F_HUTYPE] == 'F'
        # mismatch when one is set and the other is not
        if named_frontal != typed_frontal:
            return wbd_c.WRONG_FLAG
        return row[NAME_FLAG]

    def check_blank(self, row):
        """
        Return the no-data flag when the name is empty or only whitespace,
        otherwise the row's current flag.
        """
        name = row[wbd_c.F_NAME]
        is_blank = (not name) or re.match(WHITESPACE, name)
        return wbd_c.NODATA_FLAG if is_blank else row[NAME_FLAG]

    def saveas_table(self):
        """
        Write the df to an ESRI table at self.out_table.

        The df is copied column by column into a structured numpy array of unicode fields
        (255 characters by default, with narrower widths for the HUC code, hutype and flag
        fields). Any existing table at the output path is deleted before writing.
        """
        columns = list(self.df.columns)
        field_types = {col: '<U255' for col in columns}
        field_types[self.hucfield] = '<U' + self.hudigit
        field_types[wbd_c.F_HUTYPE] = '<U1'
        field_types[NAME_FLAG] = ''.join(['<U', str(int(wbd_c.FLAG_LENGTH/2))])
        records = np.zeros(len(self.df), dtype=list(field_types.items()))
        for col in columns:
            records[col] = self.df[col]
        self.out_table = os.path.join(MEMORY_FPATH, 'out_table')
        # NumPyArrayToTable doesn't respect the overwrite setting, so remove any old table first
        if arcpy.Exists(self.out_table):
            arcpy.Delete_management(self.out_table)
        arcpy.da.NumPyArrayToTable(records, self.out_table)

    def execute(self, params, messages):
        """
        Run the full name check on the WBD polygons.

        Reads the five inputs (from toolbox parameter objects when callfrom_pyt is True,
        otherwise as plain values), prepares the flag field, builds the layers and joins,
        runs every check on the df, writes the results to a temporary table and joins them
        back onto the polygons by HUC code. With add_updatecolumns all name-check fields
        are joined; otherwise only the flag field.
        """
        # parameters
        raw = [params[i] for i in range(5)]
        if self.callfrom_pyt:
            raw = [p.valueAsText for p in raw[:4]] + [raw[4].value]
        (self.polygonfc_updated,
         self.gnis_fc,
         self.nhdwaterbody_fc,
         self.nhdline_fc,
         self.add_updatecolumns) = raw
        namecheck_fields = [
            NAME_FLAG,
            wbd_c.F_MAJORFEATURE,
            wbd_c.F_FRONTALFEATURE,
            wbd_c.F_PREFIX,
            wbd_c.F_POTENTIALNAMES
            ]
        # field names for polygon fc
        self.fields = [field.name for field in arcpy.ListFields(self.polygonfc_updated)]
        if NAME_FLAG not in self.fields:
            # new flag field starts out as "correct"
            arcpy.AddField_management(self.polygonfc_updated, NAME_FLAG, 'TEXT', field_length=wbd_c.FLAG_LENGTH)
            arcpy.CalculateField_management(self.polygonfc_updated, NAME_FLAG, "'{}'".format(wbd_c.CORRECT_FLAG), 'PYTHON')
        # remove result columns left over from an earlier run (everything but the flag)
        stale = namecheck_fields[1:]
        for name in self.fields:
            if name not in stale:
                continue
            arcpy.DeleteField_management(self.polygonfc_updated, name)
        _, hudigits = wbd_f.sort_polygons([self.polygonfc_updated])
        self.hudigit = hudigits[0]
        self.hucfield = wbd_c.HUC + self.hudigit
        self.to_keep = [self.hucfield, wbd_c.F_NAME, wbd_c.F_HUTYPE, NAME_FLAG]
        self.create_fls()
        # perform joins to get which GNIS and NHD features are in each huc
        self.join_intersect()
        self.join_names()
        # convert feature class to pandas df and format df
        self.create_df()
        # perform checks
        self.check_gnis()
        self.check_prefixes()
        self.check_uniqueness()
        self.df[NAME_FLAG] = self.df.apply(self.check_huccode, axis=1)
        if wbd_c.F_HUTYPE in self.fields:
            self.df[NAME_FLAG] = self.df.apply(self.check_frontal, axis=1)
        self.df[NAME_FLAG] = self.df.apply(self.check_blank, axis=1)
        # save df as ESRI table
        self.saveas_table()
        # replace the flag field on the polygons with the checked values
        arcpy.DeleteField_management(self.polygon_fl, NAME_FLAG)
        join_fields = namecheck_fields if self.add_updatecolumns else namecheck_fields[0]
        arcpy.JoinField_management(self.polygon_fl, self.hucfield, self.out_table, self.hucfield, join_fields)
        arcpy.Delete_management(self.out_table)

if __name__ == '__main__':
    # Execute as standalone script, using hard-coded local paths.
    arcpy.env.workspace = r'D:\OneDrive - DOI\WBD\AK\AK_19020201\AK_19020201_wbd_tools.gdb'
    # same order as the toolbox parameters read by NameCheck.execute
    script_params = (
        r'WBD\WBDHU10_20210920',  # polygonfc_updated
        r"B:\BaseData\GNIS_DL04162012.gdb\GNIS\AK_GNIS",  # gnis_fc
        r'D:\OneDrive - DOI\WBD\AK\AK_19020201\ak_19020201_review_v8_extra.gdb\data\NHDWaterbody_19020201',  # nhdwaterbody_fc
        r'D:\OneDrive - DOI\WBD\AK\AK_19020201\ak_19020201_review_v8_extra.gdb\data\NHDFlowline_19020201',  # nhdline_fc
        True,  # add_updatecolumns
    )
    tool = NameCheck()
    # plain values are passed, not toolbox parameter objects
    tool.callfrom_pyt = False
    tool.execute(script_params, None)


