pastebin - collaborative debugging

pastebin is a collaborative debugging tool allowing you to share and modify code snippets while chatting on IRC, IM or a message board.

This site is developed to XHTML and CSS2 W3C standards. If you see this paragraph, your browser does not support those standards and you need to upgrade. Visit WaSP for a variety of options.

financialpython private pastebin - collaborative debugging tool What's a private pastebin?


Posted by DK on Tue 15 Sep 07:06
report abuse | download | new post

  1. from BeautifulSoup import BeautifulSoup as BS
  2. from datetime import datetime
  3. from fnmatch import fnmatch
  4. import numpy as np
  5. import os
  6.  
  7.  
  def get_files(path):
      '''
      List the entries of a directory in a reproducible order.

      path -- directory to list.

      Returns a sorted list of entry-name strings (bare names, not joined
      with `path`).  os.listdir() yields entries in arbitrary order, so the
      names are sorted to make everything built from this list repeatable
      from run to run.  Sub-directory names are included along with file
      names, because os.listdir() does not distinguish between them.
      '''
      return sorted(os.listdir(path))
  16.  
  def file_subset(files, match_string):
      '''
      Pick out the filenames that match a shell-style wildcard pattern.

      files -- list of filename strings.
      match_string -- fnmatch pattern to test each name against
                      (e.g. '20090910*.txt').

      Returns a new list holding the matching names in their original order.
      '''
      matched = []
      for name in files:
          if fnmatch(name, match_string):
              matched.append(name)
      return matched
  24.  
  25.  
  def get_soup(filename):
      '''
      Parse the html of a text file with BeautifulSoup and summarise it.

      filename -- path of the file to parse.  The table number is read from
                  the second '_'-separated field of the file's base name.

      Returns a dictionary with the keys:
        'soup'       -- list of the <table> tags found by findAll('table')
        'num_tables' -- length of that list
        'title'      -- .string of the tag returned by find('h4')
        'datestring' -- .string of the tag returned by find('caption')
        'table_no'   -- table number taken from the file name (a string)

      Raises IndexError if the base name of `filename` contains no '_'.
      '''
      # Read through a context manager so the handle is closed promptly
      # instead of being left for the garbage collector.
      with open(filename) as handle:
          html = handle.read()

      soup = BS(html)
      tables = soup.findAll('table')

      # Split only the base name: an underscore in a directory name must not
      # shift the field that holds the table number.
      table_no = os.path.basename(filename).split('_')[1]

      return {'soup': tables,
              'num_tables': len(tables),
              'title': soup.find('h4').string,
              'datestring': soup.find('caption').string,
              'table_no': table_no
              }
  44.  
  def table2array(td_soup):
      '''
      Flatten the cells of an html table into a cleaned, one-dimensional form.

      td_soup -- iterable of cell objects (e.g. the <td> tags of one table).
                 Each must expose its text through a `.string` attribute
                 whose value is either a string or None.

      Returns a tuple (clean_data_array, raw_data):
        clean_data_array -- 1-d numpy array with one entry per cell: the cell
                            text stripped of surrounding whitespace, or 'NA'
                            where the cell text is None, u' ' or '&nbsp;'
        raw_data         -- list of the untouched `.string` values
      '''
      blanks = (None, u' ', '&nbsp;')
      raw_data = [cell.string for cell in td_soup]

      # raw_data already holds the cell strings, so they are compared
      # directly.  Asking them for `.string` a second time added nothing and
      # raised AttributeError for plain str values.
      clean_data = ['NA' if text in blanks else text.strip()
                    for text in raw_data]

      return (np.array(clean_data), raw_data)
  65.  
  def reformat_array(data, table_no):
      '''
      Reshape the flattened cells of one table into a two-dimensional array.

      data -- tuple (clean_data_array, raw_data) as produced by table2array:
              a 1-d numpy array in which blank cells read 'NA', and the list
              of raw cell strings in which blank cells are None.
      table_no -- table number as a string.  It selects where the data is
                  taken to start and how many columns the table has.

      Returns a 2-d numpy array that mimics (approximately) the shape of the
      source html table.  When the cells cannot be arranged into whole rows
      (or the column count works out to zero) the tuple
      ('Table <table_no>', exception) is returned instead of an array.

      Raises ValueError if raw_data contains no None, and IndexError if the
      array holds too few non-'NA' cells to locate the start of the data.
      '''
      clean_data_array, raw_data = data

      # Positions of the non-blank cells; where() returns a tuple so need [0].
      data_index = np.where(clean_data_array != 'NA')[0]

      # Count the cells that follow the last blank (None) raw cell.
      number_columns = raw_data[::-1].index(None)
      total_columns = number_columns + 1

      if table_no in ['1', '9']:
          # Start just past the second non-blank cell and drop every
          # remaining blank cell.
          first_num = data_index[1] + 1
          temp_array = clean_data_array[first_num:]
          target = temp_array[temp_array != 'NA']
          cols = number_columns
      elif table_no in ['6', '7', '8', '14', '15', '16', '17', '18', '19']:
          # These tables start at the first non-blank cell.
          target = clean_data_array[data_index[0]:]
          cols = total_columns
      else:
          # Every other table starts at the second non-blank cell.
          target = clean_data_array[data_index[1]:]
          cols = total_columns

      try:
          # Array dimensions must be integers: use floor division and the
          # integer column count (a float dimension is rejected by numpy).
          return target.reshape((len(target) // cols, cols))
      except (ValueError, ZeroDivisionError) as e:
          return ('Table %s' % table_no, e)
  108.      
  def inspect_target(filename, path='/Volumes/Drobo/Data/DTCC/Raw/'):
      '''
      Print the reformatted array of every table found in one raw file.

      filename -- name of the file to inspect, relative to `path`.
      path -- directory that holds the raw files; defaults to the location
              this script was originally written against.

      Returns the (clean_data_array, raw_data) tuple of the last table that
      was processed, or None when the file contains no tables.
      '''
      page = get_soup(os.path.join(path, filename))
      table_no = page['table_no']

      data = None  # stays None when the page holds no <table> at all
      for table in page['soup']:
          data = table2array(table.findAll('td'))
          # Single-argument print(...) behaves the same on Python 2 and 3.
          print(reformat_array(data, table_no))
      return data
  124.            
  def main(path='/Volumes/Drobo/Data/DTCC/Raw/', match_string='20090910*.txt'):
      '''
      Convert every table of the matching raw files into numpy arrays.

      path -- directory that holds the raw files; defaults to the location
              this script was originally written against.
      match_string -- fnmatch pattern selecting the files to process.

      Returns a dictionary that maps each table number to the list of arrays
      built from that file, plus the key 'Titles' holding the list of page
      titles.  If any table could not be reshaped, the list of error tuples
      reported by reformat_array is returned instead of the dictionary.
      '''
      subset_files = file_subset(get_files(path), match_string)

      errors = []
      titles = []
      target_dict = {}
      for f in subset_files:
          # remember get_soup() returns a dictionary!
          page = get_soup(os.path.join(path, f))
          table_no = page['table_no']

          targets = []
          for table in page['soup']:
              data = table2array(table.findAll('td'))
              target = reformat_array(data, table_no)
              # reformat_array() reports a failure by returning a tuple.
              if isinstance(target, tuple):
                  errors.append(target)
              else:
                  targets.append(target)

          target_dict[table_no] = targets  # most tables are list of 1!
          titles.append(page['title'])
      target_dict['Titles'] = titles

      if errors:
          return errors
      return target_dict
  159.  
  # Run the conversion only when executed as a script, not when imported.
  if __name__ == '__main__':
      targets = main()
  162.  
  163. '''
  164. To Do:
  165.  
  166. - delete TOTAL rows and columns
  167. - insert dates into table arrays
  168. - insert headers for csv once array has been modified
  169. - write to csv files
  170. '''

Submit a correction or amendment below (click here to make a fresh posting)
After submitting an amendment, you'll be able to view the differences between the old and new posts easily.

Syntax highlighting:

To highlight particular lines, prefix each line with @@


Remember me so that I can delete my post