####################################################################################
#                                                                                  
# Copyright (c) 2008 Dallin P. Regehr <dallinregehrATgmailDOTcom>             
#                                                                                  
# This detectlet is part of the Kickbacks wizards library.
#                                    
#                                                                                  
####################################################################################
# UPDATES:
#    
#   June 13, 2008   Updated the descriptions and added code for the progress analyzer
#
# STATUS: Reviewed and Approved, Matt Hillary, June 13, 2008
#           
#
# IDEAS/QUESTIONS/NOTES:
#
# The "what is" of this script is described below. It is fully functional and
# includes data for 3 products and 4 vendors. Vendor 2152 has committed fraud
# with product 984732. It should show a 97.84% greater growth in price over time 
# compared to the average price increase of the product across all vendors.
#
####################################################################################

from picalo import *


# NOTE(review): presumably the version of the detectlet specification this
# module was written against -- nothing in this file reads it; confirm with
# the Picalo detectlet documentation before changing the value.
DETECTLET_STANDARD = 1.0

# XML description of the input wizard. Each <parameter> element collects one
# value, and its `variable` attribute carries the same name as the matching
# argument of run() below:
#   page 1: purchasetable                (type "Table")
#   page 2: purchaseDates, productID     (type "Column" of purchasetable)
#   page 3: vendorID, purchasePrices     (type "Column" of purchasetable)
#   page 4: varFromAverage               (type "int", min 0)
# NOTE(review): presumably Picalo reads this module-level string to build the
# dialog and then calls run() with the collected values -- confirm.
wizard = '''
<wizard>
  <page>
    Select the TABLE of product purchases containing product purchase dates, product ID,
    vendor ID, and purchase price:
    <parameter type="Table" variable="purchasetable"/>
  </page>
  <page>
    Select the column with the product purchase date:
    <parameter type="Column" table="purchasetable" variable="purchaseDates"/>
    Select the column with the product IDs:
    <parameter type="Column" table="purchasetable" variable="productID"/>
  </page>
  <page>
    Select the column with the vendor IDs:
    <parameter type="Column" table="purchasetable" variable="vendorID"/>
    Select the column with the product purchase prices:
    <parameter type="Column" table="purchasetable" variable="purchasePrices"/>
  </page>
  <page>
    Input the acceptable percent variation of the slope. This controls the
    sensitivity of the calculation to flagging potential fraud records. Records
    that vary from the total average price paid by more than this percent will be flagged.
    Enter percent as a number between 0 and 100.
    <parameter type="int" min="0" variable="varFromAverage"/>
  </page>
</wizard>
'''

# Explanatory text that run() returns alongside its results table; it tells
# the reader how to interpret the slope and difference columns.
# The backslash right after the opening quotes keeps the text from starting
# with an empty line.
RESULTS_TEXT = '''\
    This table contains all the vendor's product price trends over time. 
    The trend is shown as the slope. A positive slope indicates prices are
    increasing while a negative slope represents decreasing prices over time.
    
    Generally, the slopes of each vendor should be relatively similar. Thus, the difference
    from the average slope should be relatively small as well. A large difference between 
    an individual vendor and the rest of the group is a good indicator of a possible fraud.
    
    If the slope is significantly different from the average (usually greater), then the percent 
    difference is shown. If too many or too few records are shown, run the detectlet again and 
    modify the acceptable percent variation.
'''


def run(purchasetable, purchaseDates, productID, vendorID, purchasePrices, varFromAverage):
  '''Kickbacks can be detected by analyzing the price change of products bought from a specific 
  vendor over a period of time. Prices of products will generally increase over time; however, 
  vendors colluding with their customers may comfortably raise prices at a higher rate and higher 
  percentage than vendors not colluding. 
       
  Often, this fraud goes unnoticed because many product prices increase over time. 
  A proliferation of purchased products and services can also make this fraud 
  difficult to spot.  
     
  This detectlet organizes product purchase data by date and calculates the average price increase 
  over time using a linear regression. If one vendor's price increase is significantly larger than another
  vendor's price increase of the same product, this may be evidence of fraud.
     
  Note: This detectlet assumes that the purchase dates and prices have been normalized along the 
  time axis (i.e. the purchase dates occur at regular intervals (every two weeks etc). Otherwise, 
  bunches of data points together chronologically will skew the slope calculation. Picalo can do 
  this for you if necessary.
  '''
  # NOTE(review): the docstring above reads like end-user help text, so its
  # wording is left untouched -- confirm whether Picalo displays it.
  #
  # Parameters (same names as the `variable` attributes in the wizard XML):
  #   purchasetable   - table holding one purchase per record
  #   purchaseDates   - name of the purchase date column
  #   productID       - name of the product ID column
  #   vendorID        - name of the vendor ID column
  #   purchasePrices  - name of the purchase price column
  #   varFromAverage  - allowed variation from the average slope, in percent
  #
  # Returns a (resultsTable, RESULTS_TEXT) tuple. resultsTable has one record
  # per flagged vendor/product pair, sorted ascending by product ID.
  #
  # Raises AssertionError when two of the four column selections are equal.

  # validate the input data
  # Every pair of selections has to differ. A chained `a != b != c != d` only
  # compares neighbours, so e.g. purchaseDates == vendorID would slip through.
  # NOTE(review): kept as `assert` so the exception type callers see does not
  # change; be aware asserts are skipped when Python runs with -O.
  selectedColumns = [purchaseDates, productID, vendorID, purchasePrices]
  for position, column in enumerate(selectedColumns):
    assert column not in selectedColumns[position + 1:], 'Selected columns cannot be the same. Please go back and select different columns from the table.'

  flaggedSlopes = []

  # organize by product
  for productGroup in Grouping.stratify_by_value(purchasetable, productID):
    vendorPriceSlopes = []
    slopeTotals = []
    # within the product, calculate the slope (rate of change in price) for each vendor
    for vendorGroup in Grouping.stratify_by_value(productGroup, vendorID):
      # sort ascending by date
      Simple.sort(vendorGroup, True, purchaseDates)

      # regression() returns a table with one record; the slope is its first
      # record, first column (i.e. reg[0][0])
      reg = Trending.regression(vendorGroup, purchasePrices)
      slope = reg[0][0]
      # the ID columns are stored whole; only their first value is used when
      # the results table is filled in below
      vendorPriceSlopes.append([vendorGroup[vendorID], productGroup[productID], slope])
      slopeTotals.append(slope)

    # nothing to average (and nothing to flag) for a group without vendors
    if not slopeTotals:
      continue

    # calculate the average price slope for the product across all vendors
    averageSlope = sum(slopeTotals) / len(slopeTotals)

    # The tolerance is based on the magnitude of the average. Using the signed
    # average would give a negative tolerance when prices fall on average,
    # flagging every above-average vendor with a negative percentage.
    allowedDifference = abs(averageSlope) * varFromAverage * .01 # convert to decimal %

    # compare each vendor's product price slope against the average. Flagged if
    # above the average by more than the specified % variation
    for vendorColumn, productColumn, slope in vendorPriceSlopes:
      if slope <= averageSlope:
        continue
      difference = slope - averageSlope
      if difference > allowedDifference:
        if averageSlope:
          # calculate the % from the mean
          percentDifferent = difference / abs(averageSlope) * 100
        else:
          # a percentage of a zero average is undefined; leave the cell empty
          # instead of dividing by zero
          # NOTE(review): assumes the results table accepts None -- confirm
          percentDifferent = None
        flaggedSlopes.append([vendorColumn, productColumn, slope, averageSlope, difference, percentDifferent])

  # create the new table
  resultsTable = Table([
    (vendorID, unicode),
    (productID, unicode),
    ('SlopeOfPriceChange', unicode),
    ('AllVendorsAverageSlope', unicode),
    ('UnitDifferenceFromAverage', unicode),
    ('PercentDiffFromAverage', unicode),
  ])

  # populate the new table with matching records
  totalFlagged = len(flaggedSlopes)
  for counter, match in enumerate(flaggedSlopes):
    # progress is the fraction of flagged records already written
    show_progress('Populating table...', float(counter) / totalFlagged)
    rec = resultsTable.append()
    rec[vendorID] = match[0][0] # only the vendor ID from the vendor record is wanted
    rec[productID] = match[1][0] # same for product ID
    rec['SlopeOfPriceChange'] = match[2]
    rec['AllVendorsAverageSlope'] = match[3]
    rec['UnitDifferenceFromAverage'] = match[4]
    rec['PercentDiffFromAverage'] = match[5]
  clear_progress()

  # sort by product number and return
  Simple.sort(resultsTable, True, productID)

  return resultsTable, RESULTS_TEXT
  
def example_input():
  '''Load the CSV text embedded in this module (csvdata) into a table, set
  the types of the PurchaseDates and PurchasePrices columns, and return it.
  '''
  from StringIO import StringIO  # lets load_csv read the text like a file
  sample = load_csv(StringIO(csvdata))
  for columnName, columnType in (('PurchaseDates', Date), ('PurchasePrices', number)):
    sample.set_type(columnName, columnType)
  return sample


# Sample data loaded by example_input() above.
# This next part is not required, but it's easier to put the example data
# directly in this file so I don't have to worry about directories.
#
# Layout: a header row (PurchaseDates,ProductID,VendorID,PurchasePrices)
# followed by one purchase per line. The PurchaseDates and PurchasePrices
# names must stay in sync with the set_type() calls in example_input().
# The rows cover three product IDs (984732, 753612, 341235) and four vendor
# IDs (6534, 2152, 4352, 3543); within each product the purchase dates run
# from 1/1/2008 to 12/29/2009.
# The backslash right after the opening quotes keeps the CSV text from
# starting with an empty line.
csvdata = '''\
PurchaseDates,ProductID,VendorID,PurchasePrices
1/1/2008,984732,6534,1230.17
1/15/2008,984732,2152,1254.77
1/29/2008,984732,4352,1254.90
2/12/2008,984732,3543,1267.45
2/26/2008,984732,4352,1280.12
3/11/2008,984732,6534,1292.92
3/25/2008,984732,3543,1305.85
4/8/2008,984732,4352,1318.91
4/22/2008,984732,2152,1441.34
5/6/2008,984732,6534,1345.42
5/20/2008,984732,4352,1358.87
6/3/2008,984732,3543,1372.46
6/17/2008,984732,2152,1560.15
7/1/2008,984732,6534,1400.05
7/15/2008,984732,3543,1414.05
7/29/2008,984732,6534,1428.19
8/12/2008,984732,4352,1442.47
8/26/2008,984732,2152,1722.53
9/9/2008,984732,6534,1471.46
9/23/2008,984732,4352,1486.18
10/7/2008,984732,2152,1827.97
10/21/2008,984732,3543,1516.05
11/4/2008,984732,3543,1531.21
11/18/2008,984732,6534,1546.52
12/2/2008,984732,4352,1561.99
12/16/2008,984732,6534,1577.61
12/30/2008,984732,2152,2058.59
1/13/2009,984732,3543,1609.32
1/27/2009,984732,6534,1625.41
2/10/2009,984732,3543,1641.67
2/24/2009,984732,4352,1658.08
3/10/2009,984732,2152,2272.85
3/24/2009,984732,6534,1691.41
4/7/2009,984732,4352,1708.32
4/21/2009,984732,3543,1725.41
5/5/2009,984732,6534,1742.66
5/19/2009,984732,3543,1760.09
6/2/2009,984732,2152,2559.60
6/16/2009,984732,4352,1795.47
6/30/2009,984732,3543,1813.42
7/14/2009,984732,2152,2716.26
7/28/2009,984732,6534,1849.87
8/11/2009,984732,4352,1868.37
8/25/2009,984732,2152,2882.52
9/8/2009,984732,3543,1905.92
9/22/2009,984732,2152,2998.98
10/6/2009,984732,4352,1944.23
10/20/2009,984732,6534,1963.68
11/3/2009,984732,2152,3182.54
11/17/2009,984732,4352,2003.15
12/1/2009,984732,3543,2023.18
12/15/2009,984732,2152,3377.33
12/29/2009,984732,6534,2063.84
1/1/2008,753612,2152,2084.48
1/15/2008,753612,4352,2105.33
1/29/2008,753612,6534,2126.38
2/12/2008,753612,2152,2147.64
2/26/2008,753612,6534,2169.12
3/11/2008,753612,4352,2190.81
3/25/2008,753612,3543,2212.72
4/8/2008,753612,2152,2234.85
4/22/2008,753612,6534,2257.19
5/6/2008,753612,3543,2279.77
5/20/2008,753612,4352,2302.56
6/3/2008,753612,3543,2325.59
6/17/2008,753612,6534,2348.85
7/1/2008,753612,3543,2372.33
7/15/2008,753612,2152,2396.06
7/29/2008,753612,4352,2420.02
8/12/2008,753612,3543,2444.22
8/26/2008,753612,6534,2468.66
9/9/2008,753612,3543,2493.35
9/23/2008,753612,2152,2518.28
10/7/2008,753612,4352,2543.46
10/21/2008,753612,3543,2568.90
11/4/2008,753612,6534,2594.59
11/18/2008,753612,6534,2620.53
12/2/2008,753612,3543,2646.74
12/16/2008,753612,2152,2673.21
12/30/2008,753612,4352,2699.94
1/13/2009,753612,4352,2726.94
1/27/2009,753612,6534,2754.21
2/10/2009,753612,3543,2781.75
2/24/2009,753612,2152,2809.57
3/10/2009,753612,4352,2837.66
3/24/2009,753612,6534,2866.04
4/7/2009,753612,4352,2894.70
4/21/2009,753612,2152,2923.65
5/5/2009,753612,3543,2952.88
5/19/2009,753612,2152,2982.41
6/2/2009,753612,3543,3012.23
6/16/2009,753612,2152,3042.36
6/30/2009,753612,4352,3072.78
7/14/2009,753612,6534,3103.51
7/28/2009,753612,4352,3134.54
8/11/2009,753612,3543,3165.89
8/25/2009,753612,2152,3197.55
9/8/2009,753612,6534,3229.52
9/22/2009,753612,2152,3261.82
10/6/2009,753612,4352,3294.44
10/20/2009,753612,6534,3327.38
11/3/2009,753612,2152,3360.65
11/17/2009,753612,3543,3394.26
12/1/2009,753612,6534,3428.20
12/15/2009,753612,3543,3462.49
12/29/2009,753612,4352,3497.11
1/1/2008,341235,3543,3532.08
1/15/2008,341235,4352,3567.40
1/29/2008,341235,3543,3603.08
2/12/2008,341235,2152,3639.11
2/26/2008,341235,4352,3675.50
3/11/2008,341235,6534,3712.25
3/25/2008,341235,2152,3749.38
4/8/2008,341235,3543,3786.87
4/22/2008,341235,2152,3824.74
5/6/2008,341235,6534,3862.99
5/20/2008,341235,3543,3901.62
6/3/2008,341235,4352,3940.63
6/17/2008,341235,6534,3980.04
7/1/2008,341235,3543,4019.84
7/15/2008,341235,4352,4060.04
7/29/2008,341235,2152,4100.64
8/12/2008,341235,6534,4141.64
8/26/2008,341235,4352,4183.06
9/9/2008,341235,2152,4224.89
9/23/2008,341235,2152,4267.14
10/7/2008,341235,6534,4309.81
10/21/2008,341235,3543,4352.91
11/4/2008,341235,4352,4396.44
11/18/2008,341235,6534,4440.40
12/2/2008,341235,3543,4484.81
12/16/2008,341235,3543,4529.65
12/30/2008,341235,4352,4574.95
1/13/2009,341235,6534,4620.70
1/27/2009,341235,2152,4666.91
2/10/2009,341235,6534,4713.58
2/24/2009,341235,3543,4760.71
3/10/2009,341235,6534,4808.32
3/24/2009,341235,3543,4856.40
4/7/2009,341235,4352,4904.97
4/21/2009,341235,2152,4954.02
5/5/2009,341235,6534,5003.56
5/19/2009,341235,4352,5053.59
6/2/2009,341235,6534,5104.13
6/16/2009,341235,3543,5155.17
6/30/2009,341235,4352,5206.72
7/14/2009,341235,2152,5258.79
7/28/2009,341235,4352,5311.38
8/11/2009,341235,6534,5364.49
8/25/2009,341235,3543,5418.14
9/8/2009,341235,2152,5472.32
9/22/2009,341235,4352,5527.04
10/6/2009,341235,2152,5582.31
10/20/2009,341235,6534,5638.13
11/3/2009,341235,2152,5694.51
11/17/2009,341235,4352,5751.46
12/1/2009,341235,4352,5808.97
12/15/2009,341235,3543,5867.06
12/29/2009,341235,2152,5925.73
'''