Skip to content

Commit

Permalink
version 0.0.3
Browse files Browse the repository at this point in the history
  • Loading branch information
WwwwwyDev committed Apr 13, 2024
1 parent 27903d4 commit 772dd57
Show file tree
Hide file tree
Showing 9 changed files with 98 additions and 102 deletions.
2 changes: 1 addition & 1 deletion crawlist/__version__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,6 @@
# 88YbdP88 8P 88""" dP__Yb Yb 88"Yb dP__Yb Yb "88 88""
# 88 YY 88 dP 88 dP""""Yb YboodP 88 Yb dP""""Yb YboodP 888888

VERSION = (0, 0, 2)
VERSION = (0, 0, 3)

__version__ = '.'.join(map(str, VERSION))
25 changes: 11 additions & 14 deletions crawlist/analyzers/analyzer.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,14 +7,14 @@

class BaseAnalyzer(object):
"""
分析器接口,需要子类方法实现
Analyzer interface, requires subclass method implementation
"""

def list(self, limit: int) -> Generator[Any, Any, None]:
"""
列表生成器接口,每次生成一条数据
:param limit: 取最新的条数
:return: 抓取到的格式化list
List generator interface; yields one item at a time
:param limit: Number of most recent entries to take
:return: The scraped, formatted list items
"""
for element in self.crawl(limit=limit):
yield self.after(element)
Expand All @@ -30,15 +30,12 @@ def after(self, html: str) -> Any:


class Analyzer(BaseAnalyzer):
"""
分析器
"""

def __init__(self, pager: Pager, selector: Selector) -> None:
"""
分析器
:param pager: 分页器(Pagination对象)
:param selector: 抽取list的选择器(Selector对象)
Links a pager with a selector
:param pager: Pager (Pager object or its subclass implementation)
:param selector: Selector (Selector object or its subclass implementation)
"""
self.pager: Pager = pager
self.selector: Selector = selector
Expand All @@ -51,7 +48,7 @@ def crawl(self, limit: int) -> Generator[Any, str, None]:
return
res: list[str] = self.selector(html)
cnt = 0
while cnt < len(res) and limit: # 生成数据
while cnt < len(res) and limit: # Generate data
element = res[cnt]
if element not in res_set:
res_set.add(element)
Expand All @@ -67,7 +64,7 @@ def crawl(self, limit: int) -> Generator[Any, str, None]:
res = self.selector(html)
flag = False
cnt = 0
while cnt < len(res) and limit: # 生成数据
while cnt < len(res) and limit: # Generate data
element = res[cnt]
if element not in res_set:
flag = True
Expand All @@ -90,7 +87,7 @@ def after(self, html: str) -> str:

class AnalyzerPrettify(Analyzer):
"""
分析器,美化输出
Analyzer, beautify output
"""

filter_list = ["\n", "\r", "\t", "<br>", "<br/>", "</br>"]
Expand Down Expand Up @@ -120,7 +117,7 @@ def after(self, html: str) -> str:

class AnalyzerLinks(Analyzer):
"""
分析器,提取所有链接
Analyzer, extract all links
"""
url_regex = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'

Expand Down
76 changes: 38 additions & 38 deletions crawlist/analyzers/dynamic_pager.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,8 @@
class DynamicPager(Pager):
def __init__(self, webdriver: WebDriver = None, interval: float = 0.1) -> None:
"""
:param webdriver: selenium的WebDriver对象
:param interval: 抓取list频率,可使用self.sleep()方法控制频率
:param webdriver: WebDriver object for selenium
:param interval: Interval between list fetches; tune it to suit the target webpage
"""
if not webdriver:
option = wd.ChromeOptions()
Expand All @@ -30,8 +30,8 @@ def __init__(self, webdriver: WebDriver = None, interval: float = 0.1) -> None:

def click_safety(self, button: WebElement) -> None:
"""
尝试多次点击按钮
:param button: 按钮元素
Attempt to click the button multiple times
:param button: The button element to click
"""
# Retry the click several times after a failure
for _ in range(3):
Expand All @@ -56,13 +56,13 @@ class DynamicRedirectPager(DynamicPager):
def __init__(self, uri: str, uri_split: str, webdriver: WebDriver = None, start: int = 1, offset: int = 1,
interval: float = 0.1) -> None:
"""
基于动态网页分析器(重定向翻页)
:param uri: 第一页链接
:param uri_split: 链接分页(使用%v代理) example:https://www.boc.cn/sourcedb/whpj/index_%v.html
:param webdriver: selenium的WebDriver对象
:param start: 起始页
:param offset: 分页间隔
:param interval: 抓取list频率,可使用self.sleep()方法控制频率
Based on dynamic web page analyzer (redirect page flipping)
:param uri: First page link
:param uri_split: Paginated link template (use %v as the page-number placeholder) Example: https://www.boc.cn/sourcedb/whpj/index_%v.html
:param webdriver: WebDriver object for selenium
:param start: Start page
:param offset: pagination interval
:param interval: Interval between list fetches; tune it to suit the target webpage
"""
assert '%v' in uri_split
assert Valid.is_valid_url(uri) and Valid.is_valid_url(uri_split.replace('%v', str(start)))
Expand All @@ -88,10 +88,10 @@ def html(self) -> str:
class DynamicListRedirectPager(DynamicPager):
def __init__(self, uris: list, webdriver: WebDriver = None, interval: float = 0.1) -> None:
"""
基于动态网页分析器(重定向翻页)
:param uris: 含多个uri的list,按照顺序往下执行
:param webdriver: selenium的WebDriver对象
:param interval: 抓取list频率,可使用self.sleep()方法控制频率
Based on dynamic web page analyzer (redirect page flipping)
:param uris: A list containing multiple uris, executed in order downwards
:param webdriver: WebDriver object for selenium
:param interval: Interval between list fetches; tune it to suit the target webpage
"""
assert isinstance(uris, list)
for uri in uris:
Expand Down Expand Up @@ -120,10 +120,10 @@ def html(self) -> str:
class DynamicScrollPager(DynamicPager):
def __init__(self, uri: str, webdriver: WebDriver = None, interval: float = 1) -> None:
"""
基于动态网页分析器(滚动翻页)
:param uri: 网页链接,该网页是滚动翻页
:param webdriver: selenium的WebDriver对象
:param interval: 抓取list频率,可使用self.sleep()方法控制频率
Based on dynamic web page analyzer (scrolling and flipping)
:param uri: webpage link, which is a scrolling page
:param webdriver: WebDriver object for selenium
:param interval: Interval between list fetches; tune it to suit the target webpage
"""
assert Valid.is_valid_url(uri)
super().__init__(webdriver=webdriver, interval=interval)
Expand Down Expand Up @@ -152,11 +152,11 @@ class DynamicLineButtonPager(DynamicPager):
def __init__(self, uri: str, button_selector: WebElementSelector, webdriver: WebDriver = None,
interval: float = 1) -> None:
"""
基于动态网页分析器(行按钮翻页)
:param uri: 网页链接,该网页是行按钮翻页
:param button_selector: 行按钮选择器
:param webdriver: selenium的WebDriver对象
:param interval: 抓取list频率,可使用self.sleep()方法控制频率
Based on dynamic web page analyzer (row button page flipping)
:param uri: Webpage link; the page paginates via a row button
:param button_selector: row button selector
:param webdriver: WebDriver object for selenium
:param interval: Interval between list fetches; tune it to suit the target webpage
"""
assert Valid.is_valid_url(uri)
super().__init__(webdriver=webdriver, interval=interval)
Expand All @@ -180,13 +180,13 @@ class DynamicNumButtonPager(DynamicPager):
def __init__(self, uri: str, button_selector: WebElementSelector, webdriver: WebDriver = None, start: int = 1,
offset: int = 1, interval: float = 1) -> None:
"""
基于动态网页分析器(数字按钮翻页)
:param uri: 网页链接,该网页是数字按钮翻页
:param button_selector: 数字按钮选择器
:param webdriver: selenium的WebDriver对象
:param start: 起始页
:param offset: 分页间隔
:param interval: 抓取list频率,可使用self.sleep()方法控制频率
Based on dynamic web page analyzer (digital button flipping)
:param uri: Webpage link; the page paginates via numeric buttons
:param button_selector: numeric button selector
:param webdriver: WebDriver object for selenium
:param start: Start page
:param offset: pagination interval
:param interval: Interval between list fetches; tune it to suit the target webpage
"""
assert Valid.is_valid_url(uri)
super().__init__(webdriver=webdriver, interval=interval)
Expand Down Expand Up @@ -255,13 +255,13 @@ class DynamicNextButtonPager(DynamicPager):
def __init__(self, uri: str, button_selector: WebElementSelector, webdriver: WebDriver = None, start: int = 1,
offset: int = 1, interval: float = 1) -> None:
"""
基于动态网页分析器(点击下一页按钮翻页)
:param uri: 网页链接,该网页是点击下一页按钮翻页
:param button_selector: 点击下一页按钮选择器
:param webdriver: selenium的WebDriver对象
:param start: 起始页
:param offset: 分页间隔
:param interval: 抓取list频率,可使用self.sleep()方法控制频率
Based on dynamic web page analyzer (click the next page button to page)
:param uri: Web page link, which is a page that can be flipped by clicking the next page button
:param button_selector: Selector for the next-page button
:param webdriver: WebDriver object for selenium
:param start: Start page
:param offset: pagination interval
:param interval: Interval between list fetches; tune it to suit the target webpage
"""
assert Valid.is_valid_url(uri)
super().__init__(webdriver=webdriver, interval=interval)
Expand Down
9 changes: 4 additions & 5 deletions crawlist/analyzers/pager.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,23 +10,22 @@ class Pager(BasePager):

def __init__(self, interval: float = 0.1):
"""
:param interval: 抓取list频率,可使用self.sleep()方法控制频率
:param interval: Interval between list fetches; tune it to suit the target webpage
"""
self.interval: float = interval
self.half_interval: float = interval / 2

def next(self) -> None:
"""
数据增量方法
:return:
Data Incremental Method
"""
raise NotImplementedError

@property
def html(self) -> str:
"""
当前状态的html文本
:return:
HTML text in the current state
:return: The html text
"""
raise NotImplementedError

Expand Down
2 changes: 1 addition & 1 deletion crawlist/analyzers/request.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ class BaseRequest(object):

class Request(BaseRequest):
"""
http请求对象,如果需要重写,请继承Request对象
HTTP request object, if it needs to be rewritten, please inherit the Request object
"""

def request(self, uri: str) -> str:
Expand Down
14 changes: 7 additions & 7 deletions crawlist/analyzers/selector.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,8 @@ class BaseSelector(object):
class Selector(BaseSelector):
def __init__(self, pattern: str) -> None:
"""
选择器
:param pattern: 抓取规则
Selector
:param pattern: Grab rules
"""
assert self.valid(pattern)
self.pattern = pattern
Expand All @@ -34,8 +34,8 @@ def __call__(self, html: str) -> list[str]:
class WebElementSelector(BaseSelector):
def __init__(self, pattern: str) -> None:
"""
webElement选择器(selenium)
:param pattern: 抓取规则
WebElement selector (selenium)
:param pattern: Grab rules
"""
assert self.valid(pattern)
self.pattern = pattern
Expand All @@ -52,7 +52,7 @@ def __call__(self, webdriver: WebDriver, interval: float = 0.1) -> list[WebEleme

class CssSelector(Selector):
"""
css选择器
css selector
"""

def select(self, html: str) -> list[str]:
Expand All @@ -64,7 +64,7 @@ def valid(self, pattern) -> bool:

class XpathSelector(Selector):
"""
xpath选择器
xpath selector
"""

def select(self, html: str) -> list[str]:
Expand All @@ -76,7 +76,7 @@ def valid(self, pattern) -> bool:

class RegexSelector(Selector):
"""
正则表达式选择器
regex selector
"""

def select(self, html: str) -> list[str]:
Expand Down
26 changes: 13 additions & 13 deletions crawlist/analyzers/static_pager.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,8 @@
class StaticPager(Pager):
def __init__(self, request: Request = None, interval: float = 0.1):
"""
:param request: 请求对象
:param interval: 抓取list频率,可使用self.sleep()方法控制频率
:param request: Request object
:param interval: Interval between list fetches; tune it to suit the target webpage
"""
if not request:
self.request = Request()
Expand All @@ -20,13 +20,13 @@ class StaticRedirectPager(StaticPager):
def __init__(self, uri: str, uri_split: str, request: Request = None, start: int = 1, offset: int = 1,
interval: float = 0.1) -> None:
"""
基于静态网页分析器(重定向翻页)
:param uri: 第一页链接
:param uri_split: 链接分页(使用%v代替) example:https://www.boc.cn/sourcedb/whpj/index_%v.html
:param request: 请求对象
:param start: 起始页
:param offset: 分页间隔
:param interval: 抓取list频率,可使用self.sleep()方法控制频率
Based on static web page analyzer (redirect page flipping)
:param uri: First page link
:param uri_split: Paginated link template (use %v as the page-number placeholder) Example: https://www.boc.cn/sourcedb/whpj/index_%v.html
:param request: Request object
:param start: Start page
:param offset: pagination interval
:param interval: Interval between list fetches; tune it to suit the target webpage
"""
assert '%v' in uri_split
assert Valid.is_valid_url(uri) and Valid.is_valid_url(uri_split.replace('%v', str(start)))
Expand All @@ -50,10 +50,10 @@ def html(self) -> str:
class StaticListRedirectPager(StaticPager):
def __init__(self, uris: list, request: Request = None, interval: float = 0.1) -> None:
"""
基于静态网页分析器(重定向翻页)
:param uris: 含多个uri的list,按照顺序往下执行
:param request: 请求对象
:param interval: 抓取list频率,可使用self.sleep()方法控制频率
Based on static web page analyzer (redirect page flipping)
:param uris: A list containing multiple uris, executed in order downwards
:param request: Request object
:param interval: Interval between list fetches; tune it to suit the target webpage
"""
assert isinstance(uris, list)
for uri in uris:
Expand Down
Loading

0 comments on commit 772dd57

Please sign in to comment.