Posted by DK on Tue 15 Sep 07:06
report abuse | download | new post
- from BeautifulSoup import BeautifulSoup as BS
- from datetime import datetime
- from fnmatch import fnmatch
- import numpy as np
- import os
def get_files(path):
    '''
    List the contents of a directory.

    path is handed straight to os.listdir, so the returned list holds bare
    names (no directory prefix) and contains every entry of the directory,
    sub-directories included.
    '''
    return os.listdir(path)
def file_subset(files, match_string):
    '''
    Keep only the filenames that match a shell-style wildcard pattern.

    files is a list of filename strings and match_string is an fnmatch
    pattern such as '20090910*.txt'. The matching names are returned as a
    new list, in their original order.
    '''
    matching = []
    for name in files:
        if fnmatch(name, match_string):
            matching.append(name)
    return matching
def get_soup(filename):
    '''
    Parse the html stored in a text file with BeautifulSoup.

    Returns a dictionary with these keys:
      'soup'       - list of every <table> element found in the document
      'num_tables' - length of that list
      'title'      - .string of the first <h4> element
      'datestring' - .string of the first <caption> element
      'table_no'   - second '_'-separated field of the file's base name

    Raises AttributeError when the document has no <h4> or <caption>
    (find() returns None for them) and IndexError when the base name
    contains no underscore.
    '''
    # Use a context manager so the file handle is closed promptly instead
    # of being left for the garbage collector.
    with open(filename) as handle:
        html = handle.read()
    soup = BS(html)
    tables = soup.findAll('table')
    title = soup.find('h4').string
    datestring = soup.find('caption').string
    # Split only the base name: callers pass a full path, and an underscore
    # in any directory component would otherwise shift the field index.
    table_no = os.path.basename(filename).split('_')[1]
    return {
        'soup': tables,
        'num_tables': len(tables),
        'title': title,
        'datestring': datestring,
        'table_no': table_no,
    }
def table2array(td_soup):
    '''
    Flatten the <td> cells of one html table into cleaned-up text.

    td_soup is an iterable of objects exposing a .string attribute (the
    BeautifulSoup <td> tags of a table).

    Returns a 2-tuple (clean_data_array, raw_data):
      clean_data_array - numpy array holding the stripped text of each cell,
                         with 'NA' standing in for empty cells
      raw_data         - plain list of the untouched .string values
    '''
    # A cell counts as empty when its string is None or a single space.
    blank_values = [None, u' ', ' ']
    raw_cells = [cell.string for cell in td_soup]
    # The second membership test re-checks through .string; it is kept as a
    # catch-all, and short-circuiting stops it from running on None.
    cleaned = [
        'NA' if (text in blank_values or text.string in blank_values)
        else text.strip()
        for text in raw_cells
    ]
    return (np.array(cleaned), raw_cells)
def reformat_array(data, table_no):
    '''
    Reshape the flat cell data of one table into a 2-D array.

    data is the tuple produced by table2array: a "clean data" numpy array
    (with 'NA' for empty cells) and the matching "raw data" python list
    (with None for cells that had no string). table_no is the table number,
    as a string, that the data belongs to; it selects how many leading
    cells are skipped and how many columns the result gets.

    On success the return value is a 2-D numpy array that mimics
    (approximately) the shape of the source html table.

    On failure nothing is raised: the function returns the tuple
    ('Table <table_no>', exception). This covers raw data without any None
    marker, too few populated cells, a zero column count, and a cell count
    that does not divide evenly into rows.
    '''
    clean_data_array, raw_data = data
    try:
        # Positions of the populated (non-'NA') cells; where() returns a
        # tuple, hence the [0].
        data_index = np.where(clean_data_array != 'NA')[0]
        # The number of cells after the last None in the raw data is taken
        # as the count of data columns; the None cell itself adds one more.
        number_columns = raw_data[::-1].index(None)
        total_columns = number_columns + 1
        if table_no in ('1', '9'):
            # Start right after the second populated cell and drop every
            # 'NA', so only the data columns remain.
            start = data_index[1] + 1
            remainder = clean_data_array[start:]
            target = remainder[remainder != 'NA']
            cols = number_columns
        elif table_no in ('6', '7', '8', '14', '15', '16', '17', '18', '19'):
            # Start at the first populated cell.
            target = clean_data_array[data_index[0]:]
            cols = total_columns
        else:
            # Start at the second populated cell.
            target = clean_data_array[data_index[1]:]
            cols = total_columns
        # Array dimensions must be integers: floor-divide rather than use
        # "/" and float(), which current numpy rejects.
        rows = len(target) // cols
        return target.reshape(rows, cols)
    except Exception as e:
        return ('Table %s' % table_no, e)
def inspect_target(filename, path='/Volumes/Drobo/Data/DTCC/Raw/'):
    '''
    Debugging helper: print the reformatted array of every table in a file.

    filename is the name of a file inside path; path defaults to the raw
    data directory used by main().

    Each table of the page is run through table2array and reformat_array,
    and the result (an array, or an error tuple) is printed.

    Returns the table2array tuple of the last table processed, or None when
    the page contains no tables.
    '''
    page = get_soup(os.path.join(path, filename))
    table_no = page['table_no']
    # Pre-set so a page without tables returns None instead of failing on
    # an unbound name.
    data = None
    for table in page['soup']:
        data = table2array(table.findAll('td'))
        # Single-argument call form prints the same on Python 2 and 3.
        print(reformat_array(data, table_no))
    return data
def main(path='/Volumes/Drobo/Data/DTCC/Raw/', match_string='20090910*.txt'):
    '''
    Convert every table of the matching files in path into numpy arrays.

    path is the directory holding the raw html text files and match_string
    is the fnmatch pattern that selects which of them to process. Both
    default to the values that used to be hard-coded here.

    Returns one of two things:
      - if every table was reshaped successfully, a dictionary mapping each
        table number to the list of arrays found in that file (most files
        hold a single table), plus a 'Titles' key with the list of page
        titles;
      - otherwise, the list of error tuples returned by reformat_array.
    '''
    subset_files = file_subset(get_files(path), match_string)
    errors = []
    titles = []
    target_dict = {}
    for f in subset_files:
        # get_soup() returns a dictionary describing the parsed page.
        page = get_soup(os.path.join(path, f))
        table_no = page['table_no']
        targets = []
        for table in page['soup']:
            data = table2array(table.findAll('td'))
            target = reformat_array(data, table_no)
            # reformat_array reports a failure by returning a tuple
            # instead of an array.
            if isinstance(target, tuple):
                errors.append(target)
            else:
                targets.append(target)
        target_dict[table_no] = targets
        titles.append(page['title'])
    target_dict['Titles'] = titles
    if errors:
        return errors
    return target_dict
# Script entry point: run the conversion and keep the result in a
# module-level name so it can be inspected afterwards.
if __name__ == '__main__':
    targets = main()
- '''
- To Do:
- - delete TOTAL rows and columns
- - insert dates into table arrays
- - insert headers for csv once array has been modified
- - write to csv files
- '''
Submit a correction or amendment below (click here to make a fresh posting)
After submitting an amendment, you'll be able to view the differences between the old and new posts easily.