####################################################################################
#                                                                                  
# Copyright (c) 2006 Jeffrey Richardson <jeff_richardsonATbyuDOTnet>               
#                                                                                  
# This detectlet is part of the Bid Rigging detectlets library.                    
# It requires a license and is not open source.                                    
#                                                                                  
####################################################################################
# UPDATES
# June 2006  Reviewed bug with Conan--the Detectlet wizard is having errors with floats
# 1 MAY 2006  First version of the detectlet
#
####################################################################################

#
#   STATUS: DEBUG
#   Picalo responds with an error upon launching the wizard:
#           exceptions.ValueError
#           empty string for float()
#     The log is not printing anything for me to review.  Talk to Conan and get the log printing again.

#
# How do I bring in a float parameter?   THIS IS CAUSING THE ABOVE ERROR
#   -For the percent difference I should have a default value 
#
#
# IDEAS/QUESTIONS
#  Include an employee field?
#     -Discuss how employees who have multiple entries in the results may be corrupt
#
# Can I refer to Dollars or should I use something more international
#
####################################################################################

# Version marker for this detectlet file.
# NOTE(review): presumably read by Picalo to decide which detectlet format
# this file follows -- confirm against the Picalo detectlet documentation.
DETECTLET_STANDARD = 1.0

from picalo import *
# Wizard definition: an XML document describing the pages shown to the user.
# Each <parameter> declares a `variable` whose name matches one of the
# arguments of run() below (table, project_col, bid_col, vendor_col,
# amount_col, threshold).
# NOTE(review): presumably Picalo parses this XML and passes the collected
# values to run() by name -- confirm against the Picalo detectlet docs.
# NOTE(review): the STATUS notes at the top of this file report a ValueError
# ("empty string for float()") when the wizard launches; the type="float"
# parameter on the last page has no default value, which may be the cause.
wizard = '''
<wizard>
  <page>
    Please select which table contains the bidding data.
    It should look similar to the example input data (see
    the previous page), and it should have columns for
    the project, bid, vendor, and amount.
    
    If you need to join two or more tables together to get
    your data into this form, close the wizard and do so.
    <parameter type="Table" variable="table"/>
  </page>
  <page>
    Your input table should contain the records for many
    projects (these could also be called contracts).This 
    detectlet will stratify the table into one table for 
    each project, then run the analysis on each project table.
    
    Which column should the table be stratified by?  This column
    should uniquely identify each PROJECT/CONTRACT.  It might be 
    called "id", "project id", "contract id", or something similar.
    <parameter type="Column" table="table" variable="project_col"/>
  </page>
  <page>
    Each project will have some bids (one or two or several).  Which column
    specifies the BID ID?  This column should contain data
    similar to a "bid id" or "bid num" or "bid key.
    <parameter type="Column" table="table" variable="bid_col"/>
  </page>
  <page>
    Each bid will be made by a VENDOR or third party company.  Which
    column identifies the bidder?  This column might be named 
    "vendor_id", "bidder_id", "vendor", or "bidder".
    <parameter type="Column" table="table" variable="vendor_col"/>
  </page>
  <page>
    Which column contains the total bid amount?  This is is
    the total price of the entire bid and not the price of individual 
    line items.  It might be called "amount", "total", "bid total",
    "total amount", or "bid amount".
    <parameter type="Column" table="table" variable="amount_col"/>
  
    What is the limit (Dollar/Euro/Yen/etc...) for a project before it needs multiple bids?
    This is the threshold amount set by company policy.
    <parameter type="float" min="0" variable="threshold"/>
  </page>
</wizard>
'''
# Explanatory text that run() returns alongside the results table.
# The trailing backslash after the opening quotes keeps the text from
# starting with an empty line.
# NOTE(review): presumably Picalo shows this text to the user together with
# the results -- confirm against the Picalo detectlet docs.
RESULTS_TEXT = '''\
    The displayed table shows all of the projects and bids that should have
    required multiple bids.  The table indicates the project (by ID), the bid 
    (by ID), the vendor (by ID) and the amount of the bid along with the
    amount that it exceeded the threshold.
    
    Vendors that are consistantly showing up on this table should be investigated.
    If the difference between the threshold amount and bid amount is 
    very large, sever abuse may be taking place.
    
    If a specific employee is approving these bids, he or she may be directing
    work towards a specific vendors.
'''

def run(table, project_col, bid_col, vendor_col, amount_col, threshold):
  '''Most organizations require multiple bids to increase competition between
     suppliers.  This keeps prices low and increases quality.  However, for 
     small projects, multiple bids are not necessary.  Organizations will fix
     a limit or threshold as to how much a project can cost before multiple
     bids are required.  
     
     Purchasers (who work for the organization) may want to 
     circumvent this control when they are getting kickbacks from a certain 
     vendor, have family or friends they want to funnel work to, or for 
     many other reasons.  The simplest way to circumvent this control is to
     simply ignore it.  
     
     This detectlet searches for projects that have only one bid and returns 
     those bids that cost more than the determined threshold/limit.
     
     The detectlet goes through the following process:
     - Stratifies the file into a table for each project.  
     - Identifies the projects that have only one bid.
     - Calculates the difference between the threshold and the bid amount.
     - Sorts the results by the amount in excess of the threshold.
       
     Companies with a significant number of single bids breaking the limit
     may be corrupt.
     
     The example input table contains the following four columns:
     - Project: The project id
     - Vendor: The vendor id - the ID of the company bidding.
     - Bid: The bid unique id
     - Amount: The total amount of each bid.
     
     In the example dataset several projects only have one bid.  If you set the
     threshold to 70,000 then you will find at least bid #18.
  '''
  # NOTE(review): the docstring above reads as end-user help text (the wizard
  # refers the user to "the example input data"), so developer-facing notes
  # are kept in comments rather than in the docstring -- confirm with Picalo.
  #
  # Parameters (names match the wizard's <parameter variable="..."> entries):
  #   table        - the table holding the bidding data, one record per bid
  #   project_col  - column that identifies the project/contract
  #   bid_col      - column that identifies the bid
  #   vendor_col   - column that identifies the vendor (bidder)
  #   amount_col   - column holding the total bid amount
  #   threshold    - limit above which multiple bids are required; it is
  #                  converted with float() before comparing
  # Returns:
  #   (results, RESULTS_TEXT), where results has the columns project_id,
  #   bid_id, vendor, amount and threshold_excess.
  # Raises:
  #   AssertionError if any two of the four column arguments are equal.

  # validate the data
  # Every pair is compared explicitly: a chained "a != b != c != d" only
  # checks neighbours, so e.g. project_col == vendor_col would slip through.
  columns = (project_col, bid_col, vendor_col, amount_col)
  for first in range(len(columns)):
    for second in range(first + 1, len(columns)):
      assert columns[first] != columns[second], 'The Project, Bid, Vendor, and Amount columns must be different.  Please ensure you haven\'t selected the same column for two of these items.'

  # the threshold does not change between projects, so convert it once
  limit = float(threshold)

  # run the analysis
  results = Table([
    ( 'project_id',       unicode ),
    ( 'bid_id',           unicode ),
    ( 'vendor',           unicode ),
    ( 'amount',           number ),
    ( 'threshold_excess', number ),
  ])
  projects = Grouping.stratify_by_value(table, project_col)
  for bids in projects:
    if len(bids) != 1: # only projects with exactly one bid are of interest
      continue
    bid = bids[0]
    amount = float(bid[amount_col])
    if amount > limit: # the single bid is over the limit, so report it
      rec = results.append()
      rec['project_id'] = bid[project_col]
      rec['bid_id'] = bid[bid_col]
      rec['vendor'] = bid[vendor_col]
      rec['amount'] = bid[amount_col] # stored as found in the input table
      rec['threshold_excess'] = amount - limit
  # NOTE(review): the False argument presumably requests descending order
  # (largest excess first) -- confirm against Picalo's Simple.sort.
  Simple.sort(results, False,'threshold_excess')
  return results, RESULTS_TEXT

  
def example_input():
  '''Builds the example bidding table from the CSV text stored in csvdata.

     The CSV text is wrapped in a file-like object so that load_csv can read
     it, and each of the four columns is then given its type.
  '''
  from StringIO import StringIO  # lets load_csv read a string as if it were a file
  sample = load_csv(StringIO(csvdata))
  # column name -> type, applied in this order
  column_types = [
    ( 'bid_id',           int ),
    ( 'project_id',       int ),
    ( 'vendor_id',        int ),
    ( 'bid_total_amount', number ),
  ]
  for column_name, column_type in column_types:
    sample.set_type(column_name, column_type)
  return sample
# This next part is not required, but it's easier to put the example data
# directly in this file so I don't have to worry about directories.
csvdata = '''\
bid_id,project_id,vendor_id,bid_total_amount
1,1,2,81710.7
2,1,3,73616.2
3,1,8,78459.6
4,2,4,80154.2
5,2,10,84468.5
6,2,13,84499.9
7,3,3,79047.6
8,3,9,78832.6
9,3,6,80644.6
10,4,13,74844.8
11,4,1,77039.7
12,4,14,74475.7
13,5,13,88636.9
14,5,14,91261.7
15,5,1,92006.5
18,6,13,74107.6
19,7,5,60250.7
20,7,15,64108.6
21,7,7,64087.9
22,8,2,57121.4
23,8,6,57733.0
24,8,3,57941.8
25,9,14,75812.8
26,9,10,77389.5
30,10,11,80934.1
31,11,8,64544.8
32,11,3,67933.3
33,11,6,63895.0
34,12,1,76622.9
35,12,4,77277.5
36,12,10,72907.8
37,13,9,100374.9
38,13,3,102772.8
39,13,8,104931.2
40,14,8,83470.3
41,14,6,85833.0
42,14,9,85366.0
43,15,7,111531.9
44,15,5,111454.7
47,16,9,90398.3
48,16,8,93963.1
49,17,7,69701.3
50,17,12,72258.4
51,17,15,77042.6
52,18,6,65980.7
53,18,2,62398.9
54,18,9,60761.1
55,19,1,53137.9
56,19,14,52581.8
57,19,4,51033.2
58,20,10,94882.4
59,20,14,96371.4
60,20,1,94926.1
61,21,14,82492.3
62,21,1,74023.4
66,22,15,94699.5
67,23,13,97517.0
68,23,4,93742.7
69,23,1,97177.3
70,24,11,98032.0
71,24,7,95292.9
72,24,15,95506.6
73,25,12,98206.1
74,25,7,99439.2
75,25,15,96456.8
76,26,5,79598.0
77,26,11,78413.6
78,26,15,76724.4
79,27,6,67505.6
80,27,3,68713.5
81,27,2,72646.5
82,28,11,72019.1
85,29,1,104470.7
88,30,5,69785.3
89,30,15,66948.1
90,30,11,68492.1
91,31,8,83345.3
92,31,3,83426.9
95,32,12,84316.8
96,32,15,90266.1
97,33,13,79932.4
98,33,14,73360.4
99,33,10,78470.1
'''
