Skip to content

Commit

Permalink
fix get data error (#1793)
Browse files Browse the repository at this point in the history
* fix get data error

* fix get v0 data error

* optimize get_data code

* fix pylint error

* add comments
  • Loading branch information
SunsetWolf authored May 24, 2024
1 parent 6302101 commit 155f803
Show file tree
Hide file tree
Showing 2 changed files with 47 additions and 26 deletions.
5 changes: 3 additions & 2 deletions examples/orderbook_data/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ Current version of script with default value tries to connect localhost **via de

Run following command to install necessary libraries
```
pip install pytest coverage
pip install pytest coverage gdown
pip install arctic # NOTE: pip may fail to resolve the right package dependency !!! Please make sure the dependency are satisfied.
```

Expand All @@ -27,7 +27,8 @@ pip install arctic # NOTE: pip may fail to resolve the right package dependency
2. Please follow following steps to download example data
```bash
cd examples/orderbook_data/
python ../../scripts/get_data.py download_data --target_dir . --file_name highfreq_orderbook_example_data.zip
gdown https://drive.google.com/uc?id=15nZF7tFT_eKVZAcMFL1qPS4jGyJflH7e # Proxies may be necessary here.
python ../../scripts/get_data.py _unzip --file_path highfreq_orderbook_example_data.zip --target_dir .
```

3. Please import the example data to your mongo db
Expand Down
68 changes: 44 additions & 24 deletions qlib/tests/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,15 +12,11 @@
from tqdm import tqdm
from pathlib import Path
from loguru import logger
from cryptography.fernet import Fernet
from qlib.utils import exists_qlib_data


class GetData:
REMOTE_URL = "https://qlibpublic.blob.core.windows.net/data/default/stock_data"
# "?" is not included in the token.
TOKEN = b"gAAAAABkmDhojHc0VSCDdNK1MqmRzNLeDFXe5hy8obHpa6SDQh4de6nW5gtzuD-fa6O_WZb0yyqYOL7ndOfJX_751W3xN5YB4-n-P22jK-t6ucoZqhT70KPD0Lf0_P328QPJVZ1gDnjIdjhi2YLOcP4BFTHLNYO0mvzszR8TKm9iT5AKRvuysWnpi8bbYwGU9zAcJK3x9EPL43hOGtxliFHcPNGMBoJW4g_ercdhi0-Qgv5_JLsV-29_MV-_AhuaYvJuN2dEywBy"
KEY = "EYcA8cgorA8X9OhyMwVfuFxn_1W3jGk6jCbs3L2oPoA="
REMOTE_URL = "https://github.com/SunsetWolf/qlib_dataset/releases/download"

def __init__(self, delete_zip_file=False):
"""
Expand All @@ -33,9 +29,45 @@ def __init__(self, delete_zip_file=False):
self.delete_zip_file = delete_zip_file

def merge_remote_url(self, file_name: str):
fernet = Fernet(self.KEY)
token = fernet.decrypt(self.TOKEN).decode()
return f"{self.REMOTE_URL}/{file_name}?{token}"
"""
Generate download links.
Parameters
----------
file_name: str
The name of the file to be downloaded.
The file name can be accompanied by a version number, (e.g.: v2/qlib_data_simple_cn_1d_latest.zip),
if no version number is attached, it will be downloaded from v0 by default.
"""
return f"{self.REMOTE_URL}/{file_name}" if "/" in file_name else f"{self.REMOTE_URL}/v0/{file_name}"

def download(self, url: str, target_path: [Path, str]):
"""
Download a file from the specified url.
Parameters
----------
url: str
The url of the data.
target_path: str
The location where the data is saved, including the file name.
"""
file_name = str(target_path).rsplit("/", maxsplit=1)[-1]
resp = requests.get(url, stream=True, timeout=60)
resp.raise_for_status()
if resp.status_code != 200:
raise requests.exceptions.HTTPError()

chunk_size = 1024
logger.warning(
f"The data for the example is collected from Yahoo Finance. Please be aware that the quality of the data might not be perfect. (You can refer to the original data source: https://finance.yahoo.com/lookup.)"
)
logger.info(f"{os.path.basename(file_name)} downloading......")
with tqdm(total=int(resp.headers.get("Content-Length", 0))) as p_bar:
with target_path.open("wb") as fp:
for chunk in resp.iter_content(chunk_size=chunk_size):
fp.write(chunk)
p_bar.update(chunk_size)

def download_data(self, file_name: str, target_dir: [Path, str], delete_old: bool = True):
"""
Expand Down Expand Up @@ -70,21 +102,7 @@ def download_data(self, file_name: str, target_dir: [Path, str], delete_old: boo
target_path = target_dir.joinpath(_target_file_name)

url = self.merge_remote_url(file_name)
resp = requests.get(url, stream=True, timeout=60)
resp.raise_for_status()
if resp.status_code != 200:
raise requests.exceptions.HTTPError()

chunk_size = 1024
logger.warning(
f"The data for the example is collected from Yahoo Finance. Please be aware that the quality of the data might not be perfect. (You can refer to the original data source: https://finance.yahoo.com/lookup.)"
)
logger.info(f"{os.path.basename(file_name)} downloading......")
with tqdm(total=int(resp.headers.get("Content-Length", 0))) as p_bar:
with target_path.open("wb") as fp:
for chunk in resp.iter_content(chunk_size=chunk_size):
fp.write(chunk)
p_bar.update(chunk_size)
self.download(url=url, target_path=target_path)

self._unzip(target_path, target_dir, delete_old)
if self.delete_zip_file:
Expand All @@ -99,7 +117,9 @@ def check_dataset(self, file_name: str):
return status

@staticmethod
def _unzip(file_path: Path, target_dir: Path, delete_old: bool = True):
def _unzip(file_path: [Path, str], target_dir: [Path, str], delete_old: bool = True):
file_path = Path(file_path)
target_dir = Path(target_dir)
if delete_old:
logger.warning(
f"will delete the old qlib data directory(features, instruments, calendars, features_cache, dataset_cache): {target_dir}"
Expand Down

0 comments on commit 155f803

Please sign in to comment.