Otel #1087 (Draft)

wants to merge 6 commits into master
1 change: 1 addition & 0 deletions CODE_OF_CONDUCT.md
@@ -5,6 +5,7 @@ For more details, please read the
[Mozilla Community Participation Guidelines](https://www.mozilla.org/about/governance/policies/participation/).

## How to Report

For more information on how to report violations of the Community Participation Guidelines, please read our '[How to Report](https://www.mozilla.org/about/governance/policies/participation/reporting/)' page.

<!--
3 changes: 1 addition & 2 deletions CONTRIBUTING.md
@@ -60,8 +60,7 @@ instance (without any instrumentation)
after automatically rebuilding `openwpm.xpi`. The script then
drops into an `ipython` shell where the webdriver instance is available
through variable `driver`.

* `python -m test.manual_test --selenium --no_extension` launches a Firefox Selenium
- `python -m test.manual_test --selenium --no_extension` launches a Firefox Selenium
instance with no instrumentation. The script then
drops into an `ipython` shell where the webdriver instance is available
through variable `driver`.
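As an illustrative aside (not part of this diff), once the script has dropped into the `ipython` shell, the pre-bound `driver` is a standard Selenium WebDriver handle, so a quick manual check might look like this; the URL is only an example:

    driver.get("https://example.com")  # navigate the launched Firefox instance
    print(driver.title)                # inspect the loaded page
    print(driver.current_url)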
2,491 changes: 116 additions & 2,375 deletions Extension/package-lock.json

Large diffs are not rendered by default.

39 changes: 0 additions & 39 deletions crawler.py
@@ -10,8 +10,6 @@
from types import FrameType
from typing import Any, Callable, List, Literal, Optional

import sentry_sdk

from openwpm import mp_logger
from openwpm.command_sequence import CommandSequence
from openwpm.config import BrowserParams, ManagerParams
@@ -52,8 +50,6 @@
SAVE_CONTENT = os.getenv("SAVE_CONTENT", "")
PREFS = os.getenv("PREFS", None)


SENTRY_DSN = os.getenv("SENTRY_DSN", None)
LOGGER_SETTINGS = mp_logger.parse_config_from_env()

if CALLSTACK_INSTRUMENT is True:
@@ -114,38 +110,6 @@
logger_kwargs=LOGGER_SETTINGS,
)

# At this point, Sentry should be initiated
if SENTRY_DSN:
# Add crawler.py-specific context
with sentry_sdk.configure_scope() as scope:
# tags generate breakdown charts and search filters
scope.set_tag("CRAWL_DIRECTORY", CRAWL_DIRECTORY)
scope.set_tag("GCS_BUCKET", GCS_BUCKET)
scope.set_tag("DISPLAY_MODE", DISPLAY_MODE)
scope.set_tag("HTTP_INSTRUMENT", HTTP_INSTRUMENT)
scope.set_tag("COOKIE_INSTRUMENT", COOKIE_INSTRUMENT)
scope.set_tag("NAVIGATION_INSTRUMENT", NAVIGATION_INSTRUMENT)
scope.set_tag("JS_INSTRUMENT", JS_INSTRUMENT)
scope.set_tag("JS_INSTRUMENT_SETTINGS", JS_INSTRUMENT_SETTINGS)
scope.set_tag("CALLSTACK_INSTRUMENT", CALLSTACK_INSTRUMENT)
scope.set_tag("SAVE_CONTENT", SAVE_CONTENT)
scope.set_tag("DWELL_TIME", DWELL_TIME)
scope.set_tag("TIMEOUT", TIMEOUT)
scope.set_tag("MAX_JOB_RETRIES", MAX_JOB_RETRIES)
scope.set_tag("CRAWL_REFERENCE", "%s/%s" % (GCS_BUCKET, CRAWL_DIRECTORY))
# context adds addition information that may be of interest
if PREFS:
scope.set_context("PREFS", json.loads(PREFS))
scope.set_context(
"crawl_config",
{
"REDIS_QUEUE_NAME": REDIS_QUEUE_NAME,
},
)
# Send a sentry error message (temporarily - to easily be able
# to compare error frequencies to crawl worker instance count)
sentry_sdk.capture_message("Crawl worker started")

# Connect to job queue
job_queue = rediswq.RedisWQ(
name=REDIS_QUEUE_NAME, host=REDIS_HOST, max_retries=MAX_JOB_RETRIES
@@ -234,6 +198,3 @@ def callback(success: bool) -> None:
else:
manager.logger.info("Job queue finished, exiting.")
manager.close()

if SENTRY_DSN:
sentry_sdk.capture_message("Crawl worker finished")
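Nothing inside crawler.py replaces the deleted Sentry tagging in this diff. Purely as a hedged sketch, assuming the intent is to carry the same crawl metadata on OpenTelemetry spans (mirroring the tracer introduced in demo.py below), the equivalent could look roughly like:

    # Hypothetical, not part of this PR: attach crawl configuration as span
    # attributes instead of Sentry tags, using the globals defined earlier in crawler.py.
    from opentelemetry import trace

    tracer = trace.get_tracer(__name__)
    with tracer.start_as_current_span("crawl_worker") as span:
        span.set_attribute("CRAWL_DIRECTORY", CRAWL_DIRECTORY)
        span.set_attribute("GCS_BUCKET", GCS_BUCKET)
        span.set_attribute("REDIS_QUEUE_NAME", REDIS_QUEUE_NAME)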
41 changes: 23 additions & 18 deletions demo.py
@@ -1,8 +1,11 @@
import argparse
import os
from pathlib import Path
from typing import Literal

import tranco
from honeycomb.opentelemetry import HoneycombOptions, configure_opentelemetry
from opentelemetry import trace

from custom_command import LinkCountingCommand
from openwpm.command_sequence import CommandSequence
@@ -29,7 +32,6 @@
latest_list = t.list()
sites = ["http://" + x for x in latest_list.top(10)]


display_mode: Literal["native", "headless", "xvfb"] = "native"
if args.headless:
display_mode = "headless"
@@ -67,6 +69,7 @@
# manager_params.memory_watchdog = True
# manager_params.process_watchdog = True

_tracer = trace.get_tracer(__name__)

# Commands time out by default after 60 seconds
with TaskManager(
@@ -77,23 +80,25 @@
) as manager:
# Visits the sites
for index, site in enumerate(sites):

def callback(success: bool, val: str = site) -> None:
print(
f"CommandSequence for {val} ran {'successfully' if success else 'unsuccessfully'}"
with _tracer.start_as_current_span("command_issuing"):
span = trace.get_current_span()

def callback(success: bool, val: str = site) -> None:
print(
f"CommandSequence for {val} ran {'successfully' if success else 'unsuccessfully'}"
)

# Parallelize sites over all number of browsers set above.
command_sequence = CommandSequence(
site,
site_rank=index,
callback=callback,
)

# Parallelize sites over all number of browsers set above.
command_sequence = CommandSequence(
site,
site_rank=index,
callback=callback,
)

# Start by visiting the page
command_sequence.append_command(GetCommand(url=site, sleep=3), timeout=60)
# Have a look at custom_command.py to see how to implement your own command
command_sequence.append_command(LinkCountingCommand())
# Start by visiting the page
command_sequence.append_command(GetCommand(url=site, sleep=3), timeout=60)
# Have a look at custom_command.py to see how to implement your own command
command_sequence.append_command(LinkCountingCommand())

# Run commands across all browsers (simple parallelization)
manager.execute_command_sequence(command_sequence)
# Run commands across all browsers (simple parallelization)
manager.execute_command_sequence(command_sequence)
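The new import of `HoneycombOptions` and `configure_opentelemetry` is visible at the top of demo.py, but the hunk that calls them is not rendered in this view. As a minimal sketch of how the honeycomb-opentelemetry distro is usually initialised before any spans are started (option values here are placeholders; the distro can also pick up settings such as HONEYCOMB_API_KEY from the environment):

    from honeycomb.opentelemetry import HoneycombOptions, configure_opentelemetry
    from opentelemetry import trace

    # Configure the tracer provider/exporter once at startup; afterwards the
    # module-level tracer used in demo.py resolves to the configured provider.
    configure_opentelemetry(HoneycombOptions())
    tracer = trace.get_tracer(__name__)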
11 changes: 6 additions & 5 deletions environment.yaml
@@ -15,33 +15,34 @@ dependencies:
- leveldb=1.23
- multiprocess=0.70.16
- mypy=1.10.1
- nodejs=22.4.0
- nodejs=22.5.1
- pandas=2.2.2
- pillow=10.4.0
- pip=24.0
- plyvel=1.5.1
- pre-commit=3.7.1
- psutil=6.0.0
- pyarrow=16.1.0
- pytest-asyncio=0.23.7
- pytest-asyncio=0.23.8
- pytest-cov=5.0.0
- pytest=8.2.2
- pytest=8.3.1
- python=3.11.9
- pyvirtualdisplay=3.0
- recommonmark=0.7.1
- redis-py=5.0.7
- s3fs=2024.6.1
- selenium=4.22.0
- sentry-sdk=2.9.0
- sphinx-markdown-tables=0.0.17
- sphinx=7.3.7
- sphinx=7.4.7
- tabulate=0.9.0
- tblib=3.0.0
- wget=1.21.4
- pip:
- dataclasses-json==0.6.7
- domain-utils==0.7.1
- honeycomb-opentelemetry==0.5.0b0
- jsonschema==4.23.0
- opentelemetry-api==1.25.0
- tranco==0.8.1
- types-pyyaml==6.0.12.20240311
- types-redis==4.6.0.20240425