-
Notifications
You must be signed in to change notification settings - Fork 0
/
proceeding.py
185 lines (131 loc) · 5.42 KB
/
proceeding.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
#!/bin/env python
"""
proceeding.py is responsible for adding new filings
for every proceeding we are interested in.
Since the remote system does not keep track of which items we've seen,
there's a unique index on the filing.fcc_num column
to prevent multiple submissions for the same data.
As a result, the update task is inelegantly simple minded and brutish:
Search the FCC site for all proceedings we care about.
Grab all relevant filings and add them to the table.
Existing records will fail and new records will succeed.
The initial search can be run with the command:
python proceeding.py search
After the initial import, most runs will only add a few new records at a time,
if any.
It can be run once a day, perhaps under cron, with the command
python proceeding.py rss
Note that the RSS feed data spans 30 days of records, so if the updates lapse
for more than 30 days, a search update must be run (once) to catch up before
restarting with the rss feeds.
The script uses the RAILS_ENV environment variable to determine run modes
so this should be set, either implicitly
rails runner "exec ..."
or explicitly
env RAILS_ENV=production ...
It defaults to development when unset.
This script can be run in test mode with the command
python proceeding.py test 12-375
which prints out the result of parsing the rss feed for proceeding 12-375.
Author: Gyepi Sam <self-github@gyepi.com>
"""
import re
import lxml
from utils import *
import db
import comment
def parse_proceeding_search(proceeding_num):
    """A generator that runs a search on the FCC site and yields urls for
    comments and documents relating to the specified proceeding number.

    proceeding_num -- FCC proceeding number string, e.g. '12-375'.
    Yields fully qualified comment-view urls.
    Raises whatever lxml/urllib raises if the initial search page cannot
    be fetched or parsed.
    """
    # In addition to search results, the first page also contains
    # links to subsequent pages which will be followed below.
    url = None
    try:
        url = search_url(proceeding_num)
        content = lxml.html.parse(url)
        todo = [content]  # already-parsed documents to process
        pages = []        # page urls still to fetch and process
        seen = set()      # ensure each pagination url is handled only once
        for url in content.xpath('//a[contains(@href, "/ecfs/comment_search/paginate")]/@href'):
            if url not in seen:
                pages.append(hostify_url(url))
                seen.add(url)
    except Exception:
        warn('url: ', url)
        raise  # bare raise preserves the original traceback
    # Where there are more than 10 or so pages, the site lists the first N then the last page.
    # For a set of page numbers like 1 2 3 4 5 6 7 N
    # where N is the final page number, we interpolate from 7 + 1 to N - 1.
    # If there are no missing values, the range below is empty.
    if len(pages) > 1:
        lastpage = pages.pop()
        lastpage_match = re.match(r'(.+pageNumber=)(\d+)', lastpage)
        penultimate_match = re.match(r'(.+pageNumber=)(\d+)', pages[-1])
        # range() stops are exclusive, so using the last page number itself as
        # the stop yields penultimate+1 .. last-1 inclusive.
        # (The original used `last - 1` as the stop and silently skipped page N-1.)
        for number in range(int(penultimate_match.group(2)) + 1, int(lastpage_match.group(2))):
            pages.append(lastpage_match.group(1) + str(number))
        pages.append(lastpage)
    todo.extend(pages)
    for item in todo:
        if hasattr(item, 'xpath'):
            content = item  # the first page, already parsed above
        else:
            try:
                content = lxml.html.parse(item)
            except Exception as e:
                # best-effort: a single bad page should not abort the import
                warn("Error fetching or parsing link", item, e)
                continue
        for href in content.xpath('//a[contains(@href, "/ecfs/comment/view")]/@href'):
            yield hostify_url(clean_url(href))
def parse_proceeding_rss(proceeding_num):
    """Yield comment urls parsed from the proceeding's RSS feed.

    This is faster and less resource intensive than a full site search,
    and so is perfect for incremental updates.
    """
    feed = lxml.etree.parse(rss_url(proceeding_num))
    for link in feed.xpath('/rss/channel/item/link/text()'):
        yield link
def import_comments(proceeding_parser):
    """Imports all proceeding comments into the filing table and documents
    into the filing_docs table.

    proceeding_parser -- a generator function (search- or rss-based) that
    takes a proceeding number and yields comment urls.
    """
    conn = db.connection()
    try:
        cur = conn.cursor()
        cur.execute("SELECT id as proceeding_id, number FROM proceedings where status = 'Open'")
        try:
            proceedings = cur.fetchall()
        except Exception as e:  # `as` syntax works on Python 2.6+ and 3.x
            warn("cannot fetch proceeding numbers", e)
            raise
        conn.commit()
        for proceeding_id, number in proceedings:
            for url in proceeding_parser(number):
                comment.import_comment(proceeding_id, url)
    finally:
        # release the connection even when fetching or importing fails
        conn.close()
def import_comments_search():
    """Imports comments found by searching the site for all comments.

    Useful for bulk loading.
    """
    import_comments(parse_proceeding_search)
def import_comments_rss():
    """Imports comments mentioned in the RSS feed, which covers only recent
    items. Useful for incremental updates.
    """
    import_comments(parse_proceeding_rss)
if __name__ == "__main__":
    import pprint
    import sys

    # Guard against bare invocation: sys.argv[1] would raise IndexError.
    if len(sys.argv) < 2:
        warn("usage: proceeding.py (search|rss|run|test) [proceeding-number]")
        sys.exit(2)

    action = sys.argv[1]
    dry_run = action == 'test'  # 'test' parses and prints but does not import
    if action in ('test', 'run'):
        action = 'rss'  # both 'test' and 'run' operate on the rss feed

    # Optional explicit proceeding number (used by the dry-run parser call).
    proceeding_number = sys.argv[2] if len(sys.argv) > 2 else None

    runner = globals().get("import_comments_" + action)
    if runner:
        if dry_run:
            parser = globals()['parse_proceeding_' + action]
            pp = pprint.PrettyPrinter(indent=4)
            pp.pprint(list(parser(proceeding_number)))
        else:
            runner()
    else:
        warn("cannot understand task: " + action)
        sys.exit(2)