-
Notifications
You must be signed in to change notification settings - Fork 2
/
scraper.py
executable file
·113 lines (98 loc) · 3.56 KB
/
scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
#!/usr/bin/env python3.6
# -*- coding: utf-8 -*-
# Quick leetcode scraper
#
# Generate the source file of the question and pre-fixed head comment
#
# Usage: scraper.py [leetcode question url] [extension]
#
# Example: scraper.py http://.... .c
#
# Supported extension: .java, .c, .sql, .py
#
# Limitation: doesn't work for leetcode contest
from bs4 import BeautifulSoup
import requests
import sys
import re
import validators
import operator
import os
# Maximum length of a description line before it is wrapped.
LINE_WIDTH = 80

USAGE = "Usage: scraper.py [leetcode question url] [extension]"

# Description lines matching this pattern are page chrome, not problem text.
_SKIP_LINE = re.compile(r"click|Credits")

# Leading question number (e.g. "1-") at the start of the directory name.
_TITLE_NUMBER = re.compile(r"^(\d+)-")


def directory_name(title_text):
    """Return the directory name derived from the question title text.

    The first non-blank line of *title_text* is used; dots become dashes and
    all spaces are removed, so "1. Two Sum" becomes "1-TwoSum".

    Raises:
        ValueError: if *title_text* contains no non-blank line.
    """
    candidates = [ln for ln in title_text.split("\n") if ln.strip()]
    if not candidates:
        raise ValueError("question title is empty")
    return candidates[0].strip().replace(".", "-").replace(" ", "")


def file_stem(dirname):
    """Return the source file name (without extension) for *dirname*.

    The leading question number is dropped and the first character is
    lower-cased, so "1-TwoSum" becomes "twoSum".  A name without a leading
    number is kept as is (apart from the lower-cased first character).
    """
    stem = _TITLE_NUMBER.sub("", dirname, count=1)
    return stem[:1].lower() + stem[1:]


def wrap_line(line, width=LINE_WIDTH):
    """Split *line* into pieces of roughly *width* characters.

    Lines are broken at the last space before *width*; words are never split
    and leading spaces of every piece are removed.  A single word longer than
    *width* is emitted whole on its own piece.

    Returns:
        A list with at least one string (possibly empty).
    """
    pieces = []
    while len(line) > width:
        cut = line.rfind(" ", 0, width)
        if cut == -1 or not line[:cut].strip(" "):
            # No usable break point inside the window: the first word is
            # longer than the window.  Break after that word instead, so the
            # loop always makes progress (the naive cut of 0 never does).
            word_start = len(line) - len(line.lstrip(" "))
            cut = line.find(" ", word_start)
            if cut == -1:
                break
        pieces.append(line[:cut].lstrip(" "))
        line = line[cut:]
    pieces.append(line.lstrip(" "))
    return pieces


def build_header(url, description, extension):
    """Return the header comment text for the generated source file.

    Args:
        url: question URL, recorded under "[Source]".
        description: raw text of the question description; it is split on
            newlines, empty lines are dropped, lines mentioning "click" or
            "Credits" are skipped, and everything from the first line
            containing "Subscribe" onwards is ignored.
        extension: target file extension; ".py" selects "#" line comments,
            anything else a C-style "/* ... */" block.
    """
    python_mode = extension == ".py"
    mark = "#" if python_mode else "*"
    prefix = " " + mark + " "

    out = []
    if not python_mode:
        out.append("/" + mark + " \n")
    out.append(prefix + "[Source] \n")
    out.append(prefix + "\n")
    out.append(prefix + url + "\n")
    out.append(prefix + "\n")
    out.append(prefix + "[Problem Description]\n")
    out.append(prefix + "\n")

    for raw in description.split("\n"):
        if not raw:
            continue
        if "Subscribe" in raw:
            # The description ends where the "Subscribe ..." footer begins.
            break
        if _SKIP_LINE.search(raw):
            continue
        # Drop carriage returns left over from "\r\n" line endings.
        for piece in wrap_line(raw.replace("\r", "")):
            out.append(prefix + piece + "\n")

    out.append(prefix + "\n")
    out.append(prefix + "[Comments]\n")
    out.append(prefix + "\n")
    out.append(prefix + "\n")
    out.append(prefix + "\n")
    out.append(prefix + "[Companies]\n")
    if not python_mode:
        out.append(" */ \n")
    out.append("\n\n")
    return "".join(out)


def main(argv=None):
    """Scrape a question page and write the commented source file skeleton.

    Args:
        argv: ``[url, extension]``; defaults to ``sys.argv[1:]``.

    Returns:
        Process exit status: 0 on success, 1 on a bad URL / fetch / page
        layout, 2 on wrong usage.
    """
    args = sys.argv[1:] if argv is None else list(argv)
    if len(args) != 2:
        print(USAGE)
        return 2
    url, extension = args

    if not validators.url(url):
        print("Please enter a valid url!")
        return 1

    try:
        # A timeout keeps the script from hanging forever on a dead host.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
    except requests.RequestException as err:
        print("Failed to fetch {}: {}".format(url, err))
        return 1

    soup = BeautifulSoup(response.text, "lxml")
    titles = soup.find_all("div", class_="question-title")
    descriptions = soup.find_all("div", class_="question-description")
    if not titles or titles[0].h3 is None or not descriptions:
        print("Could not find the question title/description on the page.")
        return 1

    # Produce the directory name based upon the leetcode question title.
    try:
        dirname = directory_name(titles[0].h3.get_text())
    except ValueError as err:
        print("Could not parse the question title: {}".format(err))
        return 1

    # Build everything in memory first so that a failure never leaves an
    # empty, half-written file behind.
    header = build_header(url, descriptions[0].get_text(), extension)

    # Create the directory (including the "leetcode" parent if missing).
    path = os.path.join("./leetcode", dirname)
    os.makedirs(path, exist_ok=True)

    # Question text frequently contains non-ASCII symbols, so do not rely on
    # the locale's default encoding.
    target_path = os.path.join(path, file_stem(dirname) + extension)
    with open(target_path, "w", encoding="utf-8") as target:
        target.write(header)
    return 0


if __name__ == "__main__":
    sys.exit(main())