# Dallin Regehr <regehr@byu.net>
#
# Version 1.0
#
####################################################################################
#                                                                                  
# Copyright (c) 2008 Dallin P. Regehr<dallinregehrATgmailDOTcom>             
#                                                                                  
# This detectlet is part of the expense reimbursements wizards library.
#   Running it on the default data shows that the following two records match:
#   Date                EmpID         Amount
#  2008-07-15	   284	          588.83
#  2008-12-12	   284	          588.83                            
#                                                                                  
####################################################################################
from picalo import *

# NOTE(review): presumably the version of the Picalo detectlet interface this
# module is written against -- confirm against the Picalo documentation.
DETECTLET_STANDARD = 1.0

# XML description of the wizard pages shown to the user.  Each <parameter>
# declares a "variable" name; the four names used here (table, datecol,
# empIDcol, amountcol) are the same as the parameter names of run() below.
# The three Column parameters are tied to the chosen table through their
# table="table" attribute.
# NOTE(review): presumably Picalo collects these values and passes them to
# run() by name -- confirm against the Picalo detectlet documentation.
wizard = '''
<wizard>
  <page>
    Select the table containing the reimbursements made for company expenses.
    The table should contain a date column, an employee id column, and
    a reimbursment amount column.   
    <parameter type="Table" variable="table"/>
  </page>
  <page>
    Select the column containing the reimbursement date:
    <parameter type="Column" table="table" variable="datecol"/>
    Select the column containing the employee ID:
    <parameter type="Column" table="table" variable="empIDcol"/>
  </page>
  <page>
    Select the column containing the reimbursement amount:
    <parameter type="Column" table="table" variable="amountcol"/>
  </page>
</wizard>
'''

# Explanatory text returned by run() as the second element of its result
# tuple, alongside the table of matching records.  The trailing backslash
# after the opening quotes keeps the string from starting with a newline.
RESULTS_TEXT = '''\
    These results are shown in matching pairs--the two records matching on
    amount but different on processed date. This difference means that two expense 
    reimbursements are processed for exactly the same amount. This situation could mean
    that the same reimbursement was processed twice.
    As mentioned in the initial summary, the resulting records in this table
    do not conclusively mean fraud is occuring but merely provide a good 
    place to start an investigation.
'''


def run(table, datecol, empIDcol, amountcol):
  '''A common fraud is for an employee to submit an expense reimbursement into the
  system more than one time. Often the employee will wait a few days, weeks or even months.
  Most companies process so many reimbursements that a duplicate might slip through
  unnoticed, especially if the two are separated by a large amount of time.

  This wizard does the following:
  - Compares each record in the table to every other record
  - Returns all pairs of records matching exactly on reimbursement amount

  Necessary columns include:
  - Reimbursement date
  - Employee ID number or identification
  - Reimbursement amounts

  It is possible that reimbursements of exactly the same amount could be valid
  (i.e. similar purchases at regular intervals over time). However, the chance that
  such amounts would match exactly is not very high. Thus, matches returned
  from this wizard do not guarantee that fraud is occurring but, rather, merely
  indicate the possibility of fraud and give a place to start a more thorough investigation.
  '''
  # Developer notes:
  #   table      - the reimbursements table; it is sorted in place by datecol.
  #   datecol, empIDcol, amountcol - names of three distinct columns of table.
  #   Returns a (results, RESULTS_TEXT) tuple.  results holds the matching
  #   records in consecutive pairs: for every pair of rows with equal amounts,
  #   the row that comes first in the sorted table is appended, then the other.
  #   Raises AssertionError when the three column names are not all different.

  # validate the input data: every pair of columns must differ.  A chained
  # "a != b != c" would only test a != b and b != c, never a != c.
  assert datecol != empIDcol and datecol != amountcol and empIDcol != amountcol, 'The date, employee ID, and reimbursement amount columns must be different.  Please ensure you have selected different columns.'

  # sort the table by date
  Simple.sort(table, True, datecol)

  # table with duplicate records
  results = Table([
    ( datecol,   Date ),
    ( empIDcol,    int),
    ( amountcol,    number),
  ])

  # Compare each record with every record that follows it, so each unordered
  # pair is examined exactly once and a record is never compared to itself.
  record_count = len(table)
  for index, rec1 in enumerate(table):
    for other_index in range(index + 1, record_count):
      rec2 = table[other_index]
      if rec1[amountcol] == rec2[amountcol]:
        # Save both halves of the match, first record then second
        for match in (rec1, rec2):
          rec = results.append()
          rec[datecol] = match[datecol]
          rec[empIDcol] = match[empIDcol]
          rec[amountcol] = match[amountcol]

  # return the matched pairs together with the explanatory text
  return results, RESULTS_TEXT

  
  
def example_input():
  '''Builds the example input table from the CSV text embedded in this
     module so the user can see what their own input should look like.'''
  import StringIO  # lets the CSV text be handed to load_csv as a file object
  csv_file = StringIO.StringIO(csvdata)
  example = load_csv(csv_file)
  # assign the column types, in the same order as the CSV header
  column_types = (
    ('Date', Date),
    ('EmployeeID', int),
    ('Amount', number),
  )
  for column_name, column_type in column_types:
    example.set_type(column_name, column_type)
  return example
  
  
# Example data used by example_input().  Embedding it in this file is not
# required, but it avoids having to locate a separate data file on disk.
# Columns are Date, EmployeeID and Amount.  The rows dated 7/15/2008 and
# 12/12/2008 (both EmployeeID 284, Amount 588.83) are the duplicate pair
# mentioned in the header comment at the top of this file.
csvdata = '''\
Date,EmployeeID,Amount
1/13/2008,321,6903.6
1/22/2008,516,866.89
1/31/2008,327,305.22
2/9/2008,793,3168.95
2/20/2008,513,6979.86
3/2/2008,275,4052.23
3/13/2008,693,6734.51
3/19/2008,926,3418.17
3/25/2008,972,3164.31
3/31/2008,375,4172.04
4/6/2008,216,7946.88
4/16/2008,199,5829.54
4/16/2008,173,537.48
5/6/2008,986,7681.75
5/16/2008,725,973.42
5/26/2008,275,398.93
6/5/2008,693,1842.88
6/15/2008,926,5588.91
6/25/2008,199,256.57
7/5/2008,528,823.86
7/15/2008,284,588.83
7/25/2008,824,2469.82
8/4/2008,736,141.33
8/14/2008,421,1730.16
8/24/2008,821,8291.06
9/3/2008,321,1905.45
9/13/2008,516,1133.94
9/23/2008,327,5007.19
10/3/2008,793,390.05
10/17/2008,513,4606.82
10/31/2008,275,1098.91
11/14/2008,693,3267.26
11/28/2008,926,7678.35
11/28/2008,972,8061.86
12/2/2008,375,2689.86
12/12/2008,284,588.83
12/22/2008,824,4721.73
1/1/2009,736,5914.4
1/11/2009,421,4995.45
1/21/2009,821,8893.46
1/31/2009,321,2278.73
2/10/2009,516,753.88
2/20/2009,327,8292.58
3/2/2009,793,6998.37
3/12/2009,513,962.02
3/22/2009,245,308.66
4/1/2009,732,3314.07
4/11/2009,623,7295.16
4/21/2009,276,4074.41
5/1/2009,173,811.22
5/11/2009,986,3554.74
5/21/2009,725,3217.02
5/31/2009,275,4245.18
6/10/2009,693,8288.39
6/20/2009,926,5726.55
6/30/2009,972,33.01
7/10/2009,375,6871.48
7/20/2009,216,9748.64
7/30/2009,199,305.53
8/9/2009,173,1967.47
8/15/2009,986,5244.7
8/21/2009,725,590.45
8/27/2009,275,8396.95
9/2/2009,693,3624.89
9/8/2009,926,2340.19
10/2/2009,972,126.4
10/18/2009,375,6813.29
10/18/2009,216,897.51
11/7/2009,199,307.84
11/17/2009,528,3420.3
11/27/2009,284,247.53
12/7/2009,824,4024.56
12/17/2009,736,844.37
12/27/2009,421,324.67
1/6/2010,821,1092.32
'''  