-
Notifications
You must be signed in to change notification settings - Fork 0
/
parse_fidelity.py
196 lines (146 loc) · 7.62 KB
/
parse_fidelity.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
import pdftotext
import os
import re
import constants
import csv
import datetime
"""
Luckily, all of the account value and withdrawal/deposit information is on the first page.
Unfortunately, the statements have some inconsistencies. Some statements have this near the top:
Envelope # BLRJWCBBCCJJS
$42.25
Change from Last Period:
Some statements have this section, followed by the #s associated with them
Beginning Account Value
Additions
Subtractions
Transaction Costs, Fees & Charges
Change in Investment Value *
Ending Account Value **
Accrued Interest (AI)
Ending Account Value Incl. AI
After some entries, there could be an asterisk.
The `Accrued Interest (AI)` and `Ending Account Value Incl. AI` are only on some statements.
There could be entries for `Additions` and `Subtractions`. If `Subtractions` is an entry, then there could also be an entry for `Transaction Costs, Fees & Charges`
The account value is from the entry for `Ending Account Value`
"""
# The account summary sits on statement page 1; PDF pages are numbered from 1
# but the page sequence is indexed from 0, hence index 0.
FIDELITY_ACCOUNT_VALUE_PAGE = 0

# Bookkeeping store: (account_num, date) -> [deposits, withdraws, account_value]
entries = dict()
def bookkeep_month_entry(date, account_value, deposits, withdraws, account_num):
    """Store one month's figures in the module-level ``entries`` dict.

    The record is keyed by ``(account_num, date)`` and holds
    ``[deposits, withdraws, account_value]``. An existing record under the
    same key is overwritten.
    """
    record_key = (account_num, date)
    record = [deposits, withdraws, account_value]
    entries[record_key] = record
def _statement_month(txt):
    """Return the statement's closing month as ``YYYY-MM``.

    The second line of the page is expected to look like
    "February 1, 2022 - February 28, 2022"; the text after the first hyphen
    is taken as the closing date.
    """
    # Split on newlines instead of scanning a fixed number of characters, so
    # short pages do not raise IndexError and long lines are not truncated.
    period_line = txt.split("\n", 2)[1]
    closing_date = period_line.split("-")[1].strip()
    return datetime.datetime.strptime(closing_date, "%B %d, %Y").strftime("%Y-%m")


def _summary_labels(txt):
    """Return the set of account-summary row labels found on the page."""
    table_pattern = (
        r"Beginning Account Value\n"
        r"(Additions\n)?"
        r"(Subtractions\n)?"
        r"(Transaction Costs, Fees & Charges\n)?"
        r"Change in Investment Value[\* ]*\n"
        r"Ending Account Value[\* ]*\n"
        r"(Accrued Interest \(AI\)\n)?"
        r"(Ending Account Value Incl\. AI\n)?"
    )
    tables = re.findall(table_pattern, txt)
    if tables:
        # One tuple per match, holding the optional labels ("" when absent).
        found = tables[0]
    else:
        # The statement has a different layout; look for the labels anywhere
        # on the page. The longer "Incl. AI" label must come before its
        # prefix "Ending Account Value", otherwise it can never match.
        loose_pattern = (
            r"Beginning Account Value|Additions|Subtractions"
            r"|Transaction Costs, Fees & Charges|Change in Investment Value"
            r"|Ending Account Value Incl\. AI|Ending Account Value"
            r"|Accrued Interest \(AI\)"
        )
        found = re.findall(loose_pattern, txt)
    return {label.replace("\n", "") for label in found}


def _summary_amounts(txt):
    """Return an iterator over the amounts that follow the column headers.

    Each item is the captured number without sign or dollar symbol (it may
    still contain thousands separators), or "" for a lone "-" placeholder.
    """
    found = re.findall(r"This Period\n\nYear-to-Date\n\n([\s\S]*)$", txt)
    if not found:
        # "This Period" may be located elsewhere on the page.
        found = re.findall(r"Year-to-Date\n\n([\s\S]*)$", txt)
    # The "\n" is prepended so that a "-" placeholder at the very start of the
    # captured text (a zero Beginning Account Value on the first statement)
    # is picked up by the "\n-\n" alternative.
    return iter(re.findall(r"\-?\$?([\d,]*\.\d\d)|\n-\n", "\n" + found[0]))


def _this_period(amounts):
    """Consume one (This Period, Year-to-Date) pair; return the This Period value."""
    this_period = next(amounts)
    next(amounts)  # the Year-to-Date column is not used
    return this_period


def _amount_to_float(raw):
    """Convert a captured amount such as "1,234.56" to a float.

    Falsy input (the "" placeholder or the default 0) yields 0.
    """
    if raw:
        # Thousands separators must be removed; float() rejects them.
        return float(raw.replace(",", ""))
    return 0


def parse_statement(pdf_path):
    """Extract one month's summary figures from a Fidelity statement PDF.

    Only the page at index ``FIDELITY_ACCOUNT_VALUE_PAGE`` is read. A sanity
    check of the figures is printed ("check", "check 2" or "didn't add up"),
    followed by the account number.

    Args:
        pdf_path: Path of the statement PDF to open.

    Returns:
        A ``(key, payload)`` tuple where ``key`` is
        ``(account_num, "YYYY-MM")`` and ``payload`` is
        ``[contributions, withdrawals, account_val]``. ``withdrawals`` is the
        negated Subtractions figure with Transaction Costs, Fees & Charges
        added back.

    Raises:
        IndexError: If the date line, the amounts section or the account
            number cannot be located in the page text.
        ValueError: If the closing date cannot be parsed.
        StopIteration: If fewer amounts are present than the detected rows
            require.
    """
    with open(pdf_path, "rb") as f:
        pdf = pdftotext.PDF(f)
        txt = pdf[FIDELITY_ACCOUNT_VALUE_PAGE]

    formatted_date = _statement_month(txt)

    # Determine which optional rows the summary table contains.
    labels = _summary_labels(txt)
    has_additions = "Additions" in labels
    has_subtractions = "Subtractions" in labels
    # Note: this debit is already included in Subtractions.
    has_transaction_costs = "Transaction Costs, Fees & Charges" in labels

    # Grab the numbers in row order. Rows after "Ending Account Value"
    # (Accrued Interest, Ending Account Value Incl. AI) are not read because
    # none of the returned figures depend on them.
    amounts = _summary_amounts(txt)
    beginning_raw = _this_period(amounts)
    additions_raw = _this_period(amounts) if has_additions else 0
    subtractions_raw = _this_period(amounts) if has_subtractions else 0
    transaction_costs_raw = _this_period(amounts) if has_transaction_costs else 0
    change_raw = _this_period(amounts)
    ending_raw = _this_period(amounts)

    # The amounts regex drops signs; every figure is treated as a magnitude
    # and the sign is applied here.
    beginning_account_val = _amount_to_float(beginning_raw)
    account_val_change = _amount_to_float(change_raw)
    contributions = _amount_to_float(additions_raw)
    withdrawals = _amount_to_float(subtractions_raw) * -1
    account_val = _amount_to_float(ending_raw)

    # Sanity check: the figures should reconcile with the ending value, with
    # the investment change counted either as a gain or as a loss.
    added_together = beginning_account_val + account_val_change + contributions + withdrawals
    if round(added_together, 2) == account_val:
        print("check")
    else:
        added_together = beginning_account_val - account_val_change + contributions + withdrawals
        if round(added_together, 2) == account_val:
            print("check 2")
        else:
            print("didn't add up")

    # Back out transaction costs, fees and charges from withdrawals; since
    # withdrawals is a negative number, we add.
    if has_transaction_costs:
        withdrawals += _amount_to_float(transaction_costs_raw)

    account_num = re.findall(r"Account Number: ([A-Z0-9]*-[A-Z0-9]*)", txt)[0]
    print(account_num)

    key = (account_num, formatted_date)  # tuple so it can be the key in a dict
    payload = [contributions, withdrawals, account_val]
    return (key, payload)