From 861a12b6ed07a085155d46d5880e74a204201638 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Richard=20Sim=C3=B5es?= Date: Tue, 26 Jan 2016 17:36:22 -0600 Subject: [PATCH 1/5] .gitignore: +.csv and config attended/SL/config.example.py: new file --- .gitignore | 2 ++ attended/SL/config.example.py | 5 +++++ 2 files changed, 7 insertions(+) create mode 100644 attended/SL/config.example.py diff --git a/.gitignore b/.gitignore index c9b568f..e2bbb56 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,4 @@ *.pyc *.swp +*.csv +attended/SL/config.py diff --git a/attended/SL/config.example.py b/attended/SL/config.example.py new file mode 100644 index 0000000..732ac2d --- /dev/null +++ b/attended/SL/config.example.py @@ -0,0 +1,5 @@ +import os + +# Copy this file to config.py and edit the line below. + +writePath = os.getenv('DROPBOX') + '/noBIP/gbOutput/' From 7255a30ce13b3e5178d94ca28536673e49b7e065 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Richard=20Sim=C3=B5es?= Date: Tue, 26 Jan 2016 17:53:53 -0600 Subject: [PATCH 2/5] attended/SL/TXLeg.py: substantial rewrite --- attended/SL/TXLeg.py | 228 ++++++++++++++++++++++++++++++++----------- 1 file changed, 173 insertions(+), 55 deletions(-) diff --git a/attended/SL/TXLeg.py b/attended/SL/TXLeg.py index 5267231..7bb8ccd 100644 --- a/attended/SL/TXLeg.py +++ b/attended/SL/TXLeg.py @@ -1,62 +1,180 @@ +#!/usr/bin/env python + from bs4 import BeautifulSoup -from csv import DictWriter from config import writePath -import urllib2 +from unidecode import unidecode +from urllib2 import urlopen +import csv +import os.path import re -def getTXRep(url, partyDict, body): +def get_tx_rep(url, body): + + print 'Fetching ' + url + ' ...' + + # In case of connection failure: while True: - print url try: - response = urllib2.urlopen(url) - soup = BeautifulSoup(response.read(), 'lxml') - distSpan = soup.find('span', {'id': 'lblDistrict'}) - district = '' - name = '' - phone = '' - address = '' - if distSpan is not None: - district = 'TX State {0} District {1}'.format(body, distSpan.get_text().strip()) - name = re.sub(r'^.*(Rep\.|Sen\.)', '', soup.find('title').string.strip()).strip().replace(u'\u00A0', ' ').replace(' ', ' ').replace(' ', ' ').replace(u'\u0144', 'n').replace(u'\u00f1', 'n').replace(u'\u2018', "'").replace(u'\u2019', "'").replace(u'\u201A', "'").replace(u'\u201B', "'").replace(u'\u2039', "'").replace(u'\u203A', "'").replace(u'\u201C', '"').replace(u'\u201D', '"').replace(u'\u201E', '"').replace(u'\u201F', '"').replace(u'\u00AB', '"').replace(u'\u00BB', '"').replace(u'\u00e0', 'a').replace(u'\u00e1', 'a').replace(u'\u00e8', 'e').replace(u'\u00e9', 'e').replace(u'\u00ec', 'i').replace(u'\u00ed', 'i').replace(u'\u00f2', 'o').replace(u'\u00f3', 'o').replace(u'\u00f9', 'u').replace(u'\u00fa', 'u') - phone = soup.find('span', {'id': 'lblCapitolPhone'}).get_text().strip() - address = '{0} {1}'.format(soup.find('span', {'id': 'lblCapitolAddress1'}).get_text().strip(), soup.find('span', {'id': 'lblCapitolAddress2'}).get_text().strip()) - return district, name, phone, address - except Exception: - pass - - -def getTXLeg(partyDict): - houseSoup = BeautifulSoup(urllib2.urlopen('http://www.capitol.state.tx.us/Members/Members.aspx?Chamber=H').read(), 'lxml') - senateSoup = BeautifulSoup(urllib2.urlopen('http://www.capitol.state.tx.us/Members/Members.aspx?Chamber=S').read(), 'lxml') - houseTable = houseSoup.find('table', {'id': 'dataListMembers'}).find_all('td') - senateTable = senateSoup.find('table', {'id': 'dataListMembers'}).find_all('td') - dictList = [] - - for item in houseTable: - repInfo = {} - link = item.find('a') - if link is not None: - repInfo['Website'] = 'http://www.capitol.state.tx.us/Members/' + link.get('href') - repInfo['District'], repInfo['Name'], repInfo['Phone'], repInfo['Address'] = getTXRep(repInfo['Website'], partyDict, 'House') - dictList.append(repInfo) - - for item in senateTable: - repInfo = {} - link = item.find('a') - if link is not None: - repInfo['Website'] = 'http://www.capitol.state.tx.us/Members/' + link.get('href') - repInfo['District'], repInfo['Name'], repInfo['Phone'], repInfo['Address'] = getTXRep(repInfo['Website'], partyDict, 'Senate') - dictList.append(repInfo) - - return dictList - - -if __name__ == "__main__": - partyDict = {'D': 'Democratic', 'R': 'Republican', 'I': 'Independent'} - dictList = getTXLeg(partyDict) - with open(writePath + 'TXLeg.csv', 'w') as csvFile: - dwObject = DictWriter(csvFile, ['District', 'Name', 'Party', 'Website', 'Phone', 'Address', 'Email', 'Facebook', 'Twitter'], restval='') - dwObject.writeheader() - for row in dictList: - dwObject.writerow(row) + response = urlopen(url) + break + except: + continue + + soup = BeautifulSoup(response.read(), 'lxml') + + return { + 'house': get_house_rep, + 'senate': get_senator + }[body](soup) + + +# Strip HTML tags, leading and trailing spaces on each line, redundant spacing: +def thorough_strip(string): + string = re.sub(r'\<.+?>', '', string) + string = re.sub(r'[ \t]+', ' ', string) + string = re.sub(r'^[ \t]+|[ \t]+$', '', string, flags=re.MULTILINE) + string = re.sub('[\n\r]+', '\n', string) + return string + + +def get_house_rep(soup): + member_info = soup.find('div', {'class': 'member-info'}) + + number = re.search( + r'District (\d+)', str(member_info) + ).group(1) + district = 'TX State House District %s' % number + + # TX House member names are in "Last, First" format: + def rewrite_name(string): + search = re.search('Rep. (.+?)(?:, (?!Jr.))(.+)', string) + if search is None: + return None + + first, last = search.group(2).strip(), search.group(1).strip() + return unidecode(first + ' ' + last).strip() + + name = rewrite_name(member_info.find('h2').get_text()) + + phone = re.search( + r'\([0-9]{3}\)\s[0-9]{3}-[0-9]{4}', + str(member_info) + ).group() + + address = thorough_strip( + re.search( + r'Capitol Address:(.+?787\d{2})', + str(member_info), + re.DOTALL + ).group(1) + ) + + return { + 'District': district, + 'Name': name, + 'Phone': phone, + 'Address': address + } + + +def get_senator(soup): + memtitle = soup.find('div', {'class': 'memtitle'}) + + number = re.search(r'District (\d+)', memtitle.string).group(1) + district = 'TX State Senate District %s' % number + + name = unidecode( + re.search(r'Senator (.+):', memtitle.string).group(1).strip() + ) + + memoffice = re.sub( + r'<.+?>', + '\n', + str(soup.find('td', {'class': 'memoffice'})) + ).strip() + + search = re.search( + r'(The Honorable.+787\d{2}).*(\(\d{3}\).+\d{3}-\d{4})', + memoffice, + re.DOTALL + ) + + address = thorough_strip(search.group(1)) + + phone = search.group(2).strip() + + return { + 'District': district, + 'Name': name, + 'Phone': phone, + 'Address': address + } + + +# Start with the state-provided directories of members and then go to each +# member's page: +def get_tx_leg(): + + base_urls = { + 'house': 'http://www.house.state.tx.us', + 'senate': 'http://www.senate.state.tx.us/75r/Senate/' + } + tables = { + 'house': BeautifulSoup( + urlopen('http://www.house.state.tx.us/members').read(), + 'lxml' + ).find( + 'table', {'cellspacing': '10'} + ).find_all('td'), + + 'senate': BeautifulSoup( + urlopen( + 'http://www.senate.state.tx.us/75r/Senate/Members.htm' + ).read(), + 'lxml' + ).find( + 'table', {'summary': '3 column layout of List of senators by name'} + ).find_all('li') + } + + dict_list = [] + + for body in ('house', 'senate'): + for item in tables[body]: + rep_info = {} + link = item.find('a') + + if link is None: + continue + + url = base_urls[body] + link.get('href') + rep_info = {'Website': url} + rep_info.update(get_tx_rep(url, body)) + + # Skip entries with None values: + if len(filter(lambda val: val is None, rep_info.values())) > 0: + continue + + print str(rep_info) + '\n' + + dict_list.append(rep_info) + + return dict_list + + +if __name__ == '__main__': + dict_list = get_tx_leg() + with open(os.path.join(writePath, 'TXLeg.csv'), 'w') as csv_file: + csv = csv.DictWriter( + csv_file, + [ + 'District', 'Name', 'Party', 'Website', 'Phone', + 'Address', 'Email', 'Facebook', 'Twitter' + ], + restval='', + lineterminator='\n' + ) + csv.writeheader() + for row in dict_list: + csv.writerow(row) From ce509759457fc72f72f91171ad78204d58b310e8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Richard=20Sim=C3=B5es?= Date: Wed, 27 Jan 2016 10:33:27 -0600 Subject: [PATCH 3/5] stop tracking .gitignored file --- attended/SL/config.py | 3 --- 1 file changed, 3 deletions(-) delete mode 100644 attended/SL/config.py diff --git a/attended/SL/config.py b/attended/SL/config.py deleted file mode 100644 index 098a5ad..0000000 --- a/attended/SL/config.py +++ /dev/null @@ -1,3 +0,0 @@ -import os - -writePath = os.getenv('DROPBOX') + '/noBIP/gbOutput/' From f42c3a42ff002bd5527d02d10bb59d060da0eb9a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Richard=20Sim=C3=B5es?= Date: Wed, 27 Jan 2016 10:51:26 -0600 Subject: [PATCH 4/5] move multiline_strip to function library --- attended/SL/TXLeg.py | 15 ++++++++++----- lib/govbot/__init__.py | 0 lib/govbot/util.py | 8 ++++++++ 3 files changed, 18 insertions(+), 5 deletions(-) create mode 100644 lib/govbot/__init__.py create mode 100644 lib/govbot/util.py diff --git a/attended/SL/TXLeg.py b/attended/SL/TXLeg.py index 7bb8ccd..c9ce55d 100644 --- a/attended/SL/TXLeg.py +++ b/attended/SL/TXLeg.py @@ -7,7 +7,12 @@ import csv import os.path import re +import sys +sys.path.append( + os.path.join(sys.path[0], '../../lib') +) +from govbot.util import multiline_strip def get_tx_rep(url, body): @@ -30,10 +35,10 @@ def get_tx_rep(url, body): # Strip HTML tags, leading and trailing spaces on each line, redundant spacing: -def thorough_strip(string): +def multiline_strip(string): string = re.sub(r'\<.+?>', '', string) string = re.sub(r'[ \t]+', ' ', string) - string = re.sub(r'^[ \t]+|[ \t]+$', '', string, flags=re.MULTILINE) + string = re.sub(r'^\s+|\s+$', '', string, flags=re.MULTILINE) string = re.sub('[\n\r]+', '\n', string) return string @@ -62,7 +67,7 @@ def rewrite_name(string): str(member_info) ).group() - address = thorough_strip( + address = multiline_strip( re.search( r'Capitol Address:(.+?787\d{2})', str(member_info), @@ -169,8 +174,8 @@ def get_tx_leg(): csv = csv.DictWriter( csv_file, [ - 'District', 'Name', 'Party', 'Website', 'Phone', - 'Address', 'Email', 'Facebook', 'Twitter' + 'District', 'Name', 'Party', 'Website', 'Phone', 'Address', + 'Email', 'Facebook', 'Twitter' ], restval='', lineterminator='\n' diff --git a/lib/govbot/__init__.py b/lib/govbot/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/lib/govbot/util.py b/lib/govbot/util.py new file mode 100644 index 0000000..3ec6a22 --- /dev/null +++ b/lib/govbot/util.py @@ -0,0 +1,8 @@ +import re + +# Strip leading and trailing spaces on each line, redundant spacing: +def multiline_strip(string): + string = re.sub(r'[ \t]+', ' ', string) + string = re.sub(r'^\s+|\s+$', '', string, flags=re.MULTILINE) + string = re.sub('[\n\r]+', '\n', string) + return string From 10c4cd13057b5179314a502dee0c87cf468f8acb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Richard=20Sim=C3=B5es?= Date: Wed, 27 Jan 2016 10:51:58 -0600 Subject: [PATCH 5/5] attended/SL/AKLeg.py: Update to reflect website changes --- attended/SL/AKLeg.py | 113 ++++++++++++++++++++++++++----------------- 1 file changed, 69 insertions(+), 44 deletions(-) diff --git a/attended/SL/AKLeg.py b/attended/SL/AKLeg.py index ba417a0..568a9b2 100644 --- a/attended/SL/AKLeg.py +++ b/attended/SL/AKLeg.py @@ -1,64 +1,89 @@ +#!/usr/bin/env python + from bs4 import BeautifulSoup from csv import DictWriter from config import writePath -import urllib2 +from urllib2 import urlopen +from unidecode import unidecode +import os.path import re +import sys +sys.path.append( + os.path.join(sys.path[0], '../../lib') +) +from govbot.util import multiline_strip -def getAKrep(url): - while True: - print url - try: - response = urllib2.urlopen(url, timeout=10) - soup = BeautifulSoup(response.read()).find('div', {'id': 'fullpage'}) - district = re.sub(r'^.*District: ([0-9A-Za-z]*).*$', r'\1', soup.get_text().replace('\n', ' ')) - party = re.sub(r'^.*Party: ([0-9A-Za-z]*).*$', r'\1', soup.get_text().replace('\n', ' ')) - email = '' - tempEmail = soup.find('a', {'href': re.compile('mailto')}) - if tempEmail is not None: - email = re.sub('[Mm][Aa][Ii][Ll][Tt][Oo]:', '', tempEmail.get('href')) - return district, party, email - except Exception: - pass +def getAKLeg(): + house, senate = map( + lambda body: BeautifulSoup( + urlopen('http://house.legis.state.ak.us/').read() + ).find( + 'div', {'id': 'tab1-2'} + ).find( + 'ul', {'class': 'people-holder'} + ).find( + 'ul', {'class': 'item'} + ).find_all('li'), + ('house', 'senate') + ) + dictList = [] -def getAKLeg(partyDict): - houseSoup = BeautifulSoup(urllib2.urlopen('http://house.legis.state.ak.us/').read()) - senateSoup = BeautifulSoup(urllib2.urlopen('http://senate.legis.state.ak.us/').read()) + for body, table in zip(('House', 'Senate'), (house, senate)): + for item in table: + repInfo = {} + repInfo['Name'] = unidecode( + item.find('strong', {'class': 'name'}).string + ).strip() - houseTable = houseSoup.find('div', {'id': 'legislators'}).find_all('div', {'class': 'leg_float'}) - senateTable = senateSoup.find('div', {'id': 'legislators'}).find_all('div', {'class': 'leg_float'}) + link = item.find('a') + repInfo['Website'] = link.get('href') - dictList = [] + dl = item.find('dl') + district = re.search( + r'District:\s*(\w+)', dl.get_text(), re.DOTALL + ).group(1) + repInfo['District'] = 'AK State {0} District {1}'.format( + body, district + ) + + repInfo['Party'] = re.search( + r'Party:\s*(\w+)', dl.get_text(), re.DOTALL + ).group(1) + + repInfo['Phone'] = re.search( + r'Phone:\s*([0-9-]+)', dl.get_text(), re.DOTALL + ).group(1) - for item in houseTable: - repInfo = {} - link = item.find('a') - repInfo['Name'] = link.string.strip().replace(' ', ' ').replace(' ', ' ').replace(' ', ' ').replace(u'\u00f1', 'n').replace(u'\u2018', "'").replace(u'\u2019', "'").replace(u'\u201A', "'").replace(u'\u201B', "'").replace(u'\u2039', "'").replace(u'\u203A', "'").replace(u'\u201C', '"').replace(u'\u201D', '"').replace(u'\u201E', '"').replace(u'\u201F', '"').replace(u'\u00AB', '"').replace(u'\u00BB', '"').replace(u'\u00e0', 'a').replace(u'\u00e1', 'a').replace(u'\u00e8', 'e').replace(u'\u00e9', 'e').replace(u'\u00ec', 'i').replace(u'\u00ed', 'i').replace(u'\u00f2', 'o').replace(u'\u00f3', 'o').replace(u'\u00f9', 'u').replace(u'\u00fa', 'u') - repInfo['Website'] = link.get('href') - tempdist, tempparty, repInfo['Email'] = getAKrep(repInfo['Website']) - repInfo['District'] = 'AK State House District ' + tempdist - repInfo['Party'] = partyDict[str(tempparty)] - dictList.append(repInfo) + repInfo['Email'] = dl.find('a').get('href').replace('mailto:', '') - for item in senateTable: - repInfo = {} - link = item.find('a') - repInfo['Name'] = link.string.strip().replace(' ', ' ').replace(' ', ' ').replace(' ', ' ').replace(u'\u00f1', 'n').replace(u'\u2018', "'").replace(u'\u2019', "'").replace(u'\u201A', "'").replace(u'\u201B', "'").replace(u'\u2039', "'").replace(u'\u203A', "'").replace(u'\u201C', '"').replace(u'\u201D', '"').replace(u'\u201E', '"').replace(u'\u201F', '"').replace(u'\u00AB', '"').replace(u'\u00BB', '"').replace(u'\u00e0', 'a').replace(u'\u00e1', 'a').replace(u'\u00e8', 'e').replace(u'\u00e9', 'e').replace(u'\u00ec', 'i').replace(u'\u00ed', 'i').replace(u'\u00f2', 'o').replace(u'\u00f3', 'o').replace(u'\u00f9', 'u').replace(u'\u00fa', 'u') - repInfo['Website'] = link.get('href') - tempdist, tempparty, repInfo['Email'] = getAKrep(repInfo['Website']) - repInfo['District'] = 'AK State Senate District ' + tempdist - repInfo['Party'] = partyDict[str(tempparty)] - dictList.append(repInfo) + member_soup = BeautifulSoup(urlopen(repInfo['Website']).read()) + repInfo['Address'] = multiline_strip( + re.search( + r'Session Contact(.+99801)', + member_soup.find_all('div', {'class': 'bioleft'})[1].get_text(), + re.DOTALL + ).group(1) + ) + print str(repInfo) + '\n' + dictList.append(repInfo) return dictList if __name__ == '__main__': - partyDict = {'(R)': 'Republican', '(D)': 'Democratic', '(I)': 'Independent', 'R': 'Republican', 'D': 'Democratic', '': 'Unknown', 'I': 'Independent', 'Democrat': 'Democratic', 'Republican': 'Republican', 'Democratic': 'Democratic', 'Independent': 'Independent'} - dictList = getAKLeg(partyDict) - with open(writePath + 'AKLeg.csv', 'w') as csvFile: - dwObject = DictWriter(csvFile, ['District', 'Name', 'Party', 'Website', 'Email', 'Phone', 'Address'], restval='') + dictList = getAKLeg() + with open(os.path.join(writePath, 'AKLeg.csv'), 'w') as csvFile: + dwObject = DictWriter( + csvFile, + [ + 'District', 'Name', 'Party', 'Website', 'Email', 'Phone', + 'Address' + ], + restval='', + lineterminator='\n' + ) dwObject.writeheader() for row in dictList: dwObject.writerow(row)