#!/usr/bin/python

from picalo import *
from picalo.base.Column import _ColumnLoader

# Detectlet format marker.
# NOTE(review): presumably the detectlet API version that Picalo checks when
# it loads this file -- confirm against the Picalo documentation.
DETECTLET_STANDARD = 1.0

# XML description of the wizard pages presented to the user.  The "variable"
# attribute of each <parameter> element matches a parameter name of run()
# below (employeetable, vendortable, employeecol, vendorcol, threshold).
wizard = '''
<wizard>
  <page>
    Select the employee (internal) table:
    <parameter type="Table" variable="employeetable"/>
    Select the vendor (external) table:
    <parameter type="Table" variable="vendortable"/>
  </page>
  <page>
    Select the column in the employee table
    to match:
    <parameter type="Column" table="employeetable" variable="employeecol"/>
    Select the column in the vendor table to match:
    <parameter type="Column" table="vendortable" variable="vendorcol"/>
  </page>
  <page>
    Please enter the minimum percentage match that signals a possible
    fuzzy match.  For example, an address match of 20 percent or better
    typically signals a possible match.  
    <parameter type="int" min="0" max="100" default="30" variable="threshold"/>
  </page>
</wizard>
'''

# Explanatory text that run() returns together with the result table.
RESULTS_TEXT = '''\
    The displayed table contains records that matched over the threshold.
    If not enough (or no) records matched, try decreasing the minimum
    match percentage.  If too many records matched, try increasing the 
    minimum.
'''


def run(employeetable, employeecol, vendortable, vendorcol, threshold):
  '''A common fraud is for an employee within an organization to set 
     up a dummy company and use his or her home address as the base for
     the company.  Normally, employees and vendors should have drastically
     different addresses.
     
     This wizard uses the fuzzy match algorithm to match an internal
     (typically the employee) table and an external (typically the vendor)
     table.
     
     The typical match column is the employee street address column to
     the vendor street address column.      
     Since this routine matches a single column to a single column, you
     may want to create a calculated column in each table that combines
     the street address, city, and state.  However, often the street 
     address alone is enough to find matches effectively.
  '''
  # Parameters (names match the "variable" attributes of the wizard XML):
  #   employeetable, vendortable -- the two tables to compare
  #   employeecol, vendorcol     -- name of the column to compare in each table
  #   threshold                  -- minimum match percentage, 0-100
  # Returns a (table, text) tuple: a new table holding one row per matching
  # employee/vendor record pair (match score first, then all employee columns,
  # then all vendor columns), sorted on the score column, plus RESULTS_TEXT.
  # Raises AssertionError when the same table is passed for both sides.
  #
  # NOTE(review): the docstring above is intentionally left as written in case
  # the host application displays it to the user -- confirm before rewording.

  # validate the input data
  # (kept as an assert so callers that expect AssertionError keep working;
  # be aware that asserts are stripped when Python runs with -O)
  assert employeetable != vendortable, 'The employee and vendor tables cannot be the same.  Please ensure you have selected two different tables.'
  
  # perform the match: the wizard supplies a percentage, the comparison in the
  # expression uses a 0.0-1.0 fraction
  pct = threshold / 100.0
  # NOTE(review): column names are spliced into the expression unescaped; a
  # name containing a single quote would produce an invalid expression.
  expression = "Simple.fuzzymatch(record1['" + employeecol + "'], record2['" + vendorcol + "'], 5) >= " + str(pct)
  matches = Simple.custom_match(employeetable, vendortable, expression)
  
  # create the new table: score column + employee columns + vendor columns.
  # Both tables must be walked with get_columns(); the loaders come from the
  # column objects, not from the column names.
  # NOTE(review): the score column is declared as unicode although the value
  # stored in it comes straight from Simple.fuzzymatch -- confirm this is the
  # intended column type for sorting.
  newcols = \
    [ _ColumnLoader('FuzzyMatch', unicode) ] + \
    [ col.get_columnloader() for col in employeetable.get_columns() ] + \
    [ col.get_columnloader() for col in vendortable.get_columns() ]

  # ensure we don't have two columns with the same name: a repeated name gets
  # a numeric suffix (_2, _3, ...) until it is unique
  used_names = set()
  for loader in newcols:
    base = loader.name
    name = base
    counter = 2
    while name in used_names:
      name = base + '_%i' % counter
      counter += 1
    used_names.add(name)
    loader.name = name
  joined = Table(newcols)
  
  # populate the new table with matching records; each match is indexed as
  # (employee record index, vendor record index)
  total = float(len(matches))
  for counter, match in enumerate(matches):
    show_progress('Joining...', counter / total)
    emprec = employeetable[match[0]]
    venrec = vendortable[match[1]]
    # recompute the score with the same arguments used in the match expression
    matchpercent = Simple.fuzzymatch(emprec[employeecol], venrec[vendorcol], 5)
    fields = [ matchpercent ] + emprec + venrec
    joined.append(fields)
  clear_progress()
  
  # sort and return
  # NOTE(review): the False argument presumably requests descending order so
  # the best matches come first -- confirm against Simple.sort.
  Simple.sort(joined, False, 'FuzzyMatch')
  return joined, RESULTS_TEXT

  