Skip to content

Commit

Permalink
Merge pull request #575 from sbenthall/i569
Browse files Browse the repository at this point in the history
Moves config.py and config.yaml into bigbang dir. Addresses #569
  • Loading branch information
sbenthall authored Nov 10, 2022
2 parents dede162 + bda97db commit cb3f052
Show file tree
Hide file tree
Showing 31 changed files with 145 additions and 134 deletions.
2 changes: 1 addition & 1 deletion bigbang/analysis/listserv.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
import yaml
from bs4 import BeautifulSoup

from config.config import CONFIG
from bigbang.config import CONFIG

import bigbang.bigbang_io as bio
from bigbang.utils import (
Expand Down
2 changes: 1 addition & 1 deletion bigbang/analysis/repo_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
from nbconvert import PythonExporter
import nbformat

from config.config import CONFIG
from bigbang.config import CONFIG

from bigbang.ingress.git_repo import GitRepo, MultiGitRepo

Expand Down
2 changes: 1 addition & 1 deletion bigbang/analysis/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
from pathlib import Path
import numpy as np
import pandas as pd
from config.config import CONFIG
from bigbang.config import CONFIG

filepath_auth = CONFIG.config_path + "authentication.yaml"
directory_project = str(Path(os.path.abspath(__file__)).parent.parent)
Expand Down
2 changes: 1 addition & 1 deletion bigbang/archive.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
from bigbang.parse import get_date, get_text
import bigbang.analysis.process as process
from bigbang.analysis.thread import Node, Thread
from config.config import CONFIG
from bigbang.config import CONFIG

from . import utils

Expand Down
6 changes: 3 additions & 3 deletions bigbang/bigbang_io.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
from pathlib import Path
import numpy as np
import pandas as pd
from config.config import CONFIG
from bigbang.config import CONFIG

from bigbang.analysis import utils
from bigbang.data_types import Message, MailList, MailListDomain
Expand Down Expand Up @@ -196,7 +196,7 @@ def mlist_to_mbox(
msgs: MailList,
dir_out: str,
filename: str,
mode: Optional[str]='w',
mode: Optional[str] = "w",
) -> None:
"""
Saves a List[mailbox.mboxMessage] as .mbox file.
Expand All @@ -213,7 +213,7 @@ def mlist_to_mbox(
# create filepath
filepath = f"{dir_out}/{filename}.mbox"
# delete file if there is one at the filepath
if Path(filepath).is_file() and mode == 'w':
if Path(filepath).is_file() and mode == "w":
Path(filepath).unlink()
mbox = mailbox.mbox(filepath)
mbox.lock()
Expand Down
27 changes: 27 additions & 0 deletions bigbang/config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
import yaml
import os

# Directory that contains this module (the ``bigbang`` package directory).
bigbang_path = os.path.dirname(os.path.realpath(__file__))
# Parent directory of the ``bigbang`` package directory; values of config keys
# containing "path" are joined onto this location by ``Config``.
base_loc = os.path.abspath(os.path.join(bigbang_path, os.pardir))
config_filepath = os.path.join(base_loc, "bigbang", "config.yml")
# Read the YAML inside a context manager so the file handle is closed as soon
# as parsing finishes. The name ``stream`` stays bound at module level (now
# referring to a closed file) for backward compatibility.
with open(config_filepath, "r", encoding="utf-8") as stream:
    dictionary = yaml.safe_load(stream)


class Config(object):
    """Attribute-style accessor for a configuration mapping.

    ``cfg.some_key`` returns ``conf["some_key"]``. If the attribute name
    contains the substring ``"path"``, the stored value is joined onto the
    module-level ``base_loc`` before being returned. Names that are not keys
    of the mapping yield ``None`` instead of raising.

    Parameters
    ----------
    conf : mapping of setting names to values, or ``None`` (treated as an
        empty mapping, e.g. when the YAML file is empty).
    """

    def __init__(self, conf):
        # An empty YAML document parses to None; fall back to an empty
        # mapping so attribute access returns None instead of raising
        # TypeError on the membership test.
        self.CONFIG = conf if conf is not None else {}

    def __getattr__(self, query):
        """Return the configured value for ``query`` or ``None`` if unset.

        Raises
        ------
        AttributeError
            For ``CONFIG`` itself when it has not been set on the instance,
            and for dunder names, so that ``hasattr``/``copy``/``pickle``
            probes behave normally.
        """
        # __getattr__ only runs after normal lookup failed. If "CONFIG" ends
        # up here the instance was never initialised; reading self.CONFIG
        # again would recurse until RecursionError. Dunder names are protocol
        # probes, never configuration keys.
        if query == "CONFIG" or (
            query.startswith("__") and query.endswith("__")
        ):
            raise AttributeError(query)
        if query not in self.CONFIG:
            return None
        ans = self.CONFIG[query]
        if "path" in query:
            # Path-like settings are stored relative to base_loc.
            ans = os.path.join(base_loc, ans)
        return ans


# Module-level configuration object built from the parsed config.yml contents.
CONFIG = Config(dictionary)
2 changes: 1 addition & 1 deletion config/config.yml → bigbang/config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
# For Configuration to work properly, paths need to have "path"
# in their keyword.
# NOTE: BE CAREFUL NOT TO PUT 'path' IN OTHER ATTRIBUTE NAMES
config_path : "config/"
config_path : "bigbang/"
repo_path : "archives/sample-git-repos/"
mail_path : "archives/"
datatracker_path : "archives/datatracker"
Expand Down
File renamed without changes.
10 changes: 5 additions & 5 deletions bigbang/ingress/abstract.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
import yaml
from bs4 import BeautifulSoup

from config.config import CONFIG
from bigbang.config import CONFIG
from bigbang.utils import get_paths_to_files_in_directory
import bigbang.bigbang_io as bio
from bigbang.data_types import Message, MailList
Expand Down Expand Up @@ -108,7 +108,7 @@ def create_email_message(

for key, value in header.items():
if "from" == key:
value = value.replace(" at ", '@')
value = value.replace(" at ", "@")

if "content-type" == key:
msg.set_param("Content-Type", value)
Expand Down Expand Up @@ -157,12 +157,12 @@ def from_url(
the Email. The latter is the default.
"""
soup = get_website_content(url, session=self.session)

if soup == "RequestException":
header = self.empty_header
body = "RequestException"
attachments = "RequestException"

else:
if fields in ["header", "total"]:
header = self._get_header_from_html(soup)
Expand All @@ -179,7 +179,7 @@ def from_url(
else:
body = None
attachments = None

return self.create_email_message(url, body, attachments, **header)

@staticmethod
Expand Down
2 changes: 1 addition & 1 deletion bigbang/ingress/git_repo.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from git import *
from git import Repo

from config.config import CONFIG
from bigbang.config import CONFIG

from bigbang import utils
from bigbang.analysis.entity_resolution import entity_resolve
Expand Down
2 changes: 1 addition & 1 deletion bigbang/ingress/listserv.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
import yaml
from bs4 import BeautifulSoup

from config.config import CONFIG
from bigbang.config import CONFIG

import bigbang.bigbang_io as bio
from bigbang.data_types import MailList
Expand Down
2 changes: 1 addition & 1 deletion bigbang/ingress/mailman.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@

import bigbang.archive as archive

from config.config import CONFIG
from bigbang.config import CONFIG

from . import listserv, w3c, pipermail
from .. import parse
Expand Down
90 changes: 48 additions & 42 deletions bigbang/ingress/pipermail.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
import yaml
from bs4 import BeautifulSoup

from config.config import CONFIG
from bigbang.config import CONFIG

import bigbang.bigbang_io as bio
from bigbang.data_types import MailList
Expand Down Expand Up @@ -103,17 +103,23 @@ def from_pipermail_file(
fields: str = "total",
) -> mboxMessage:
""" """
header_start_line_nr = self.find_start_of_header(fcontent, header_end_line_nr)
header_start_line_nr = self.find_start_of_header(
fcontent, header_end_line_nr
)

if header_start_line_nr is None:
logger.info("The start of header in {list_name}" +\
" {header_end_line_nr} couldnt be found.")
print(f"The start of header in {list_name}" +\
f"{header_end_line_nr} couldnt be found.")
logger.info(
"The start of header in {list_name}"
+ " {header_end_line_nr} couldnt be found."
)
print(
f"The start of header in {list_name}"
+ f"{header_end_line_nr} couldnt be found."
)
archived_at = None
body = None
header = {}

else:
if fields in ["header", "total"]:
header = self._get_header_from_pipermail_file(
Expand All @@ -128,7 +134,7 @@ def from_pipermail_file(
else:
body = None
archived_at = f"{list_name}_line_nr_{header_start_line_nr}"

return self.create_email_message(archived_at, body, **header)

def _get_header_from_pipermail_file(
Expand All @@ -146,17 +152,17 @@ def _get_header_from_pipermail_file(
"""
fheader = fcontent[header_start_line_nr:header_end_line_nr]
header = {}

for lnr in range(len(fheader)):
line = fheader[lnr]
# get header keyword and value
if re.match(r"\S+:\s+\S+", line):
key = line.split(":")[0]
value = line.replace(key + ":", "").strip().rstrip("\n")
header[key.lower()] = value

return header

def _get_body_from_pipermail_file(
self,
fcontent: List[str],
Expand All @@ -169,37 +175,37 @@ def _get_body_from_pipermail_file(
# remove empty lines and join into one string
body = ("\n").join([line for line in body if len(line) > 1])
return body

def find_start_of_header(
self,
fcontent: List[str],
header_end_line_nr: int,
) -> int:
header_start_line_nr = None

for i in range(200): # 200 lines up just to make sure...
if fcontent[header_end_line_nr - i - 1] == '':
if fcontent[header_end_line_nr - i - 1] == "":
header_start_line_nr = header_end_line_nr - i + 1
break

return header_start_line_nr

def find_end_of_body(
self,
fcontent: List[str],
body_start_line_nr: int,
) -> int:
found = False
line_nr = body_start_line_nr + 2

while found is False:
line_nr += 1
if line_nr >= len(fcontent):
body_end_line_nr = -1
found = True
elif fcontent[line_nr].startswith('Message-ID:'):
elif fcontent[line_nr].startswith("Message-ID:"):
for i in range(200):
if 'From:' in fcontent[line_nr - i]:
if "From:" in fcontent[line_nr - i]:
body_end_line_nr = line_nr - i - 2
found = True
break
Expand Down Expand Up @@ -246,7 +252,7 @@ def from_url(
name: str,
url: str,
select: Optional[dict] = {"fields": "total"},
instant_save: Optional[bool]=True,
instant_save: Optional[bool] = True,
) -> "PipermailMailList":
"""Docstring in `AbstractMailList`."""
if "fields" not in list(select.keys()):
Expand Down Expand Up @@ -281,7 +287,7 @@ def from_period_urls(
url: str,
period_urls: List[str],
fields: str = "total",
instant_save: Optional[bool]=True,
instant_save: Optional[bool] = True,
) -> "PipermailMailList":
"""
Parameters
Expand All @@ -292,20 +298,22 @@ def from_period_urls(
for period_url in tqdm(period_urls, ascii=True, desc=name):
file = requests.get(
period_url,
verify=f"{directory_project}/config/icann_certificate.pem",
verify=os.path.join(
CONFIG.config_path, "icann_certificate.pem"
),
)

try:
fcontent = gzip.decompress(file.content).decode("utf-8")
except Exception:
print(f"File {period_url} in {name} could not be decoded")
continue
fcontent = fcontent.split('\n')

fcontent = fcontent.split("\n")
header_end_line_nrs = [
idx+1
idx + 1
for idx, fl in enumerate(fcontent)
if fl.startswith('Message-ID:')
if fl.startswith("Message-ID:")
]
for header_end_line_nr in header_end_line_nrs:
msgs.append(
Expand All @@ -315,10 +323,13 @@ def from_period_urls(
)
if (len(msgs) > 1e3) and (instant_save):
bio.mlist_to_mbox(
msgs, CONFIG.mail_path+"ICANN/", name, 'a',
msgs,
CONFIG.mail_path + "ICANN/",
name,
"a",
)
msgs = []

return cls(name, url, msgs)

@classmethod
Expand Down Expand Up @@ -384,36 +395,32 @@ def get_all_periods_and_their_urls(
time.sleep(0.5)
soup = get_website_content(
url,
verify=f"{directory_project}/config/icann_certificate.pem",
verify=os.path.join(CONFIG.config_path, "icann_certificate.pem"),
)
periods = []
urls_of_periods = []

if soup != "RequestException":
rows = soup.select(f'a[href*=".txt.gz"]')
for row in rows:
filename = row.get("href")
filename = row.get("href")
if filename.endswith(".txt.gz") is False:
continue
year = re.findall(r"\d{4}", filename)[0]
month = filename.split('.')[0].replace(f"{year}-", '')
month = filename.split(".")[0].replace(f"{year}-", "")
periods.append(f"{month} {year}")
urls_of_periods.append(url + "/" + filename)

return periods, urls_of_periods

@staticmethod
def get_name_from_url(url: str) -> str:
"""Get name of mailing list."""
return url.split('/')[-1]

return url.split("/")[-1]

class PipermailMailListDomain():


def __init__(
self, name: str, lists: List[Union[AbstractMailList, str]]
):
class PipermailMailListDomain:
def __init__(self, name: str, lists: List[Union[AbstractMailList, str]]):
self.name = name
self.lists = lists

Expand Down Expand Up @@ -461,7 +468,6 @@ def from_mailing_lists(
return cls(name, lists)



def text_for_selector(soup: BeautifulSoup, selector: str):
"""
Filter out header or body field from website and return them as utf-8 string.
Expand Down
Loading

0 comments on commit cb3f052

Please sign in to comment.