#!/usr/bin/python

from picalo import *
import email, mailbox, os, os.path, types

# NOTE(review): presumably the version of the Picalo detectlet standard that
# this module is written against -- confirm with the Picalo documentation.
DETECTLET_STANDARD = 1.0

# XML description of the single wizard page shown to the user.  The two
# <parameter> elements use the variable names "filename" and "dirname", which
# match the keyword arguments of run() below.
# NOTE(review): presumably Picalo feeds the collected values to run() by those
# names -- confirm against the detectlet loader.
wizard = '''
<wizard>
  <page>
    How is the email formatted?  
    
    Mbox format, arguably the most common format in
    the world, places a number of emails in a single file, resulting in one file
    per mail folder.  If your email is in this format, specify the filename here:

    <parameter type="filename" variable="filename"/>

    Maildir format instead uses one file per email, resulting in 
    an entire directory of files per mail folder.  The Maildir directory should contain
    a set of maildir-style files, with one email per file.  If your email is in
    this format, specify the directory holding the email files here:
    
    <parameter type="dirname" variable="dirname"/>
  </page>
</wizard>
'''

# HTML explanation that run() returns alongside the results table.  The filter
# example must use the real column name of the results table (MsgSubject);
# the table has no column called "Subject".
RESULTS_TEXT = '''\
<html><body>
    The displayed table contains one record for each email.  Most of today's emails
    contain two parts: a text-based message and an html-formatted message.  Email clients
    normally display the html version of the email (since it can contain fonts, colors, and
    other formatting options).  The text version is often included for those email clients
    that cannot display html.  The table contains one column for the text version and one for
    the html version.  If a given email includes only one format, only that column contains 
    a value.
    <p>
    If you want to filter the emails based on text in a given column, use the "in" keyword. 
    For example, setting the following filter:
    <p>
    &nbsp;&nbsp;&nbsp;&nbsp;"Parley's" in MsgSubject
    <p>
    will display only records with the word Parley's in the subject line.  Picalo's regular
    expressions can also be used for powerful pattern matching in a similar fashion.
</body></html>
'''



def _read_mbox(filename):
  '''Returns a list holding every message stored in the mbox file.'''
  show_progress("Reading emails...")
  try:
    # create=False: never create an empty mailbox as a side effect of reading
    mbox = mailbox.mbox(filename, create=False)
    try:
      # values() materializes every message, so closing afterwards is safe
      return mbox.values()
    finally:
      mbox.close()
  finally:
    clear_progress()


def _read_maildir(dirname):
  '''Returns a list with one parsed message per regular file in dirname.'''
  messages = []
  try:
    names = os.listdir(dirname)
    total = float(len(names))
    for i, fname in enumerate(names):
      show_progress("Reading emails...", i / total)
      path = os.path.join(dirname, fname)
      if not os.path.isfile(path):
        # sub-directories (e.g. cur/new/tmp) cannot be opened as emails
        continue
      f = open(path)
      try:
        messages.append(email.message_from_file(f))
      finally:
        f.close()
  finally:
    clear_progress()
  return messages


def _decoded_body(part):
  '''Returns the body of a text part with its transfer encoding
     (base64, quoted-printable) undone and its bytes decoded to text.'''
  payload = part.get_payload(decode=True)
  if payload is None:
    # decode=True yields None for multipart containers; keep the raw payload
    return part.get_payload()
  charset = part.get_content_charset() or 'ascii'
  try:
    return payload.decode(charset, 'replace')
  except LookupError:
    # the charset named in the header is unknown to Python; latin-1 maps
    # every byte, so no content is lost
    return payload.decode('latin-1', 'replace')


def run(filename=None, dirname=None):
  '''Email can be one of the most useful sources of information.  
     Once you've imported an email folder, you can use Picalo's built-in features
     to search for keywords, phrases, or communication patterns.  For example, suppose
     you want to show all emails that have the word "Parley's" in the subject line.
     The following filter will show only those records:
     
     "Parley's" in MsgSubject
     
     This detectlet imports a number of standard email formats, including Maildir and mbox.  
     These formats represent the most popular formats used today.  The detectlet does not directly read 
     Outlook's .pst format, Notes .nsf format, or the Groupwise format because these formats
     are proprietary.  If you need to read these formats, search the web for a converter program.
     For example, many .pst -> mbox format converters exist on the Internet.  Once the file
     has been converted to mbox, run this detectlet again.
     
     If both a filename and a directory are given, the mbox file is used and the
     directory is ignored.
     
     @param filename:                      The file to load if loading mbox format, or None if not using mbox.
     @type  filename:                      str
     @param dirname:                       The directory to load if loading Maildir format, or None if not using Maildir.
     @type  dirname:                       str
     @return:                              A (table, text) pair: a new table containing one record per email
                                           message, and the html text that explains the table.
     @rtype:                               tuple
     @raise AssertionError:                If neither argument is given, or the given file/directory does not exist.
  '''
  # validate the variables.  AssertionError is raised explicitly (rather than
  # through the assert statement) so validation still happens under python -O.
  if filename:
    if not os.path.isfile(filename):
      raise AssertionError('Please select a valid mbox file.')
    messages = _read_mbox(filename)
  elif dirname:
    if not os.path.isdir(dirname):
      raise AssertionError('Please select a valid directory holding the Maildir files.')
    messages = _read_maildir(dirname)
  else:
    raise AssertionError('Please specify either a filename for mbox format or a directory name for Maildir format.')
  
  # create the results table
  results = Table([
    ('MsgId', unicode),
    ('InReplyTo', unicode),
    ('MsgFrom', unicode),
    ('MsgTo', unicode),
    ('MsgCc', unicode),
    ('MsgSubject', unicode),
    ('MsgDate', Date),
    ('TextBody', unicode),
    ('HtmlBody', unicode),
  ])
  
  # import the email using the email library
  total = float(len(messages))
  try:
    for i, msg in enumerate(messages):
      show_progress("Parsing emails...", i / total)
      rec = results.append()
      rec['MsgId'] = msg['Message-ID']
      rec['InReplyTo'] = msg['In-Reply-To']
      rec['MsgFrom'] = msg['From']
      rec['MsgTo'] = msg['To']
      rec['MsgCc'] = msg['Cc']
      rec['MsgSubject'] = msg['Subject']
      rec['MsgDate'] = msg['Date']
      
      # take the first text and the first html section.  walk() yields the
      # message itself when it is not multipart, so one loop covers both cases.
      text_body = None
      html_body = None
      for part in msg.walk():
        content_type = part.get_content_type()
        if text_body is None and content_type == 'text/plain':
          text_body = _decoded_body(part)
        elif html_body is None and content_type == 'text/html':
          html_body = _decoded_body(part)
      if text_body is not None:
        rec['TextBody'] = text_body
      if html_body is not None:
        rec['HtmlBody'] = html_body
  finally:
    # never leave the progress indicator up, even if a message fails to parse
    clear_progress()
    
  # return the results
  return results, RESULTS_TEXT
  

# for testing
if __name__ == '__main__':
  # Manual smoke test: import a sample mbox file and show the resulting table.
  # run() takes (filename, dirname); only the mbox filename is needed here.
  # To try Maildir input instead, call run(dirname=...) with a directory of
  # email files.
  results, text = run(filename='testmbox.eml')
  results.view()
  
  