Skip to content

Commit

Permalink
Merge pull request #3 from WwwwwyDev/develop
Browse files Browse the repository at this point in the history
version 0.0.7
  • Loading branch information
WwwwwyDev committed Apr 30, 2024
2 parents 785f10f + ceb6c9d commit cea61c3
Show file tree
Hide file tree
Showing 5 changed files with 77 additions and 27 deletions.
2 changes: 1 addition & 1 deletion crawlist/__version__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,6 @@
# 88YbdP88 8P 88""" dP__Yb Yb 88"Yb dP__Yb Yb "88 88""
# 88 YY 88 dP 88 dP""""Yb YboodP 88 Yb dP""""Yb YboodP 888888

VERSION = (0, 0, 6)
VERSION = (0, 0, 7)

__version__ = '.'.join(map(str, VERSION))
43 changes: 27 additions & 16 deletions crawlist/analyzers/driver.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,23 +3,13 @@
from selenium.webdriver.common.options import ArgOptions
from selenium.webdriver.remote.webdriver import WebDriver
from webdriver_manager.chrome import ChromeDriverManager
from webdriver_manager.firefox import GeckoDriverManager
from selenium.webdriver import DesiredCapabilities


class BaseDriver:
    """Empty root type shared by every driver factory in this module."""


class Browser:
    """
    Names of the supported browsers, exposed as string constants.
    """

    Chrome = "chrome"
    Firefox = "firefox"


class Driver(BaseDriver):

def get_driver(self) -> WebDriver:
Expand All @@ -30,30 +20,51 @@ def __call__(self) -> WebDriver:


class DefaultDriver(Driver):
    """Driver factory that builds a local Chrome WebDriver.

    The chromedriver binary is resolved with ``ChromeDriverManager().install()``.

    :param is_debug: when False (default) Chrome is started with ``--headless``;
        when True the headless switch is not added.
    :param is_eager: controls ``page_load_strategy``.
        NOTE(review): the flag is applied inverted -- ``'eager'`` is set when
        this is False (the default) and nothing is set when it is True.
        The behavior is kept unchanged here so existing callers are not
        affected; confirm the intended meaning before flipping it.
    """

    def __init__(self, is_debug: bool = False, is_eager: bool = False):
        self.is_debug = is_debug
        self.is_eager = is_eager

    def get_driver(self) -> WebDriver:
        """Create, configure and return a new Chrome WebDriver.

        :return: a ``wd.Chrome`` instance with a 10 second implicit wait
        """
        option = wd.ChromeOptions()
        add_default_chrome_options(option)
        if not self.is_debug:
            option.add_argument("--headless")
        # NOTE(review): condition reads inverted relative to the flag name.
        if not self.is_eager:
            option.page_load_strategy = 'eager'
        option.add_experimental_option('excludeSwitches', ['enable-automation'])
        webdriver = wd.Chrome(service=Service(ChromeDriverManager().install()), options=option)
        # Registered for every new document so that `navigator.webdriver`
        # reads as false inside the page.
        webdriver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
            "source": """
                Object.defineProperty(navigator, 'webdriver', {
                    get: () => false
                })
            """
        })
        webdriver.implicitly_wait(10)
        return webdriver


class DefaultRemoteDriver(Driver):
    """Driver factory that connects to a remote WebDriver endpoint with Chrome options.

    :param webdriver_url: address passed to ``wd.Remote`` as ``command_executor``
    :param is_eager: controls ``page_load_strategy``.
        NOTE(review): the flag is applied inverted -- ``'eager'`` is set when
        this is False (the default) and nothing is set when it is True.
        The behavior is kept unchanged here so existing callers are not
        affected; confirm the intended meaning before flipping it.
    """

    def __init__(self, webdriver_url: str, is_eager: bool = False):
        self.webdriver_url = webdriver_url
        self.is_eager = is_eager

    def get_driver(self) -> WebDriver:
        """Create, configure and return a new remote WebDriver session.

        The session is always headless.

        :return: a ``wd.Remote`` instance with a 10 second implicit wait
        """
        option = wd.ChromeOptions()
        add_default_chrome_options(option)
        option.add_argument("--headless")
        # NOTE(review): condition reads inverted relative to the flag name.
        if not self.is_eager:
            option.page_load_strategy = 'eager'
        option.set_capability('cloud:options', DesiredCapabilities.CHROME)
        option.add_experimental_option('excludeSwitches', ['enable-automation'])
        webdriver = wd.Remote(command_executor=self.webdriver_url, options=option)
        # Registered for every new document so that `navigator.webdriver`
        # reads as false inside the page.
        # NOTE(review): confirm that the Remote driver of the pinned Selenium
        # version exposes `execute_cdp_cmd`.
        webdriver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
            "source": """
                Object.defineProperty(navigator, 'webdriver', {
                    get: () => false
                })
            """
        })
        webdriver.implicitly_wait(10)
        return webdriver

Expand All @@ -66,8 +77,8 @@ def add_default_chrome_options(option: ArgOptions):
'User-Agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36"',
"window-size=1920x3000",
"start-maximized",
'cache-control="max-age=0"'
'cache-control="max-age=0"',
"disable-blink-features=AutomationControlled"
]
for argument in arguments:
option.add_argument(argument)
option.page_load_strategy = 'eager'
27 changes: 19 additions & 8 deletions crawlist/analyzers/pager/dynamic_pager.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import parsel
from selenium.webdriver import ActionChains, Keys
from selenium.webdriver.remote.webelement import WebElement
from selenium.webdriver.remote.webdriver import WebDriver

Expand All @@ -11,15 +12,18 @@

class DynamicPager(Pager):
@check
def __init__(self, webdriver: Driver | WebDriver = None, interval: float = 0.1) -> None:
    """
    :param webdriver: either a Driver factory (it is called to obtain the WebDriver) or an
        already created selenium WebDriver, which is used as-is. When omitted, a
        DefaultDriver is created and called.
    :param interval: Grab the list frequency and adjust it according to the actual situation of the webpage
    """
    if not webdriver:
        self.webdriver = DefaultDriver()()
    elif isinstance(webdriver, WebDriver):
        # A live WebDriver is not callable; keep it directly.
        self.webdriver = webdriver
    else:
        self.webdriver = webdriver()
    super().__init__(interval=interval)

def click_safety(self, button: WebElement) -> None:
Expand Down Expand Up @@ -48,7 +52,7 @@ def __del__(self):

class DynamicRedirectPager(DynamicPager):
@check
def __init__(self, uri: str, uri_split: str, webdriver: Driver = None, start: int = 1, offset: int = 1,
def __init__(self, uri: str, uri_split: str, webdriver: Driver | WebDriver = None, start: int = 1, offset: int = 1,
interval: float = 0.1) -> None:
"""
Based on dynamic web page analyzer (redirect page flipping)
Expand Down Expand Up @@ -83,7 +87,7 @@ def html(self) -> str:

class DynamicListRedirectPager(DynamicPager):
@check
def __init__(self, uris: list, webdriver: Driver = None, interval: float = 0.1) -> None:
def __init__(self, uris: list, webdriver: Driver | WebDriver = None, interval: float = 0.1) -> None:
"""
Based on dynamic web page analyzer (redirect page flipping)
:param uris: A list containing multiple uris, executed in order downwards
Expand Down Expand Up @@ -116,7 +120,7 @@ def html(self) -> str:

class DynamicScrollPager(DynamicPager):
@check
def __init__(self, uri: str, webdriver: Driver = None, interval: float = 1) -> None:
def __init__(self, uri: str, webdriver: Driver | WebDriver = None, interval: float = 1) -> None:
"""
Based on dynamic web page analyzer (scrolling and flipping)
:param uri: webpage link, which is a scrolling page
Expand All @@ -140,6 +144,13 @@ def __init__(self, uri: str, webdriver: Driver = None, interval: float = 1) -> N
def next(self) -> None:
    """Advance the scrolling page by one step.

    Runs the class-level ``js_code`` script, then clicks at the current
    pointer position (offset 0, 0) and sends the SPACE key five times,
    sleeping between the individual steps.
    """
    self.webdriver.execute_script(DynamicScrollPager.js_code)
    self.sleep()

    chain = ActionChains(self.webdriver)
    chain.move_by_offset(0, 0)
    chain.click()
    chain.perform()
    self.sleep()

    # The same chain object is reused for every key press.
    remaining = 5
    while remaining > 0:
        chain.send_keys(Keys.SPACE)
        chain.perform()
        self.sleep()
        remaining -= 1
    self.sleep()

@property
def html(self) -> str:
Expand All @@ -151,7 +162,7 @@ def pre_load(self, webdriver: WebDriver) -> None:

class DynamicLineButtonPager(DynamicPager):
@check
def __init__(self, uri: str, button_selector: WebElementSelector, webdriver: Driver = None,
def __init__(self, uri: str, button_selector: WebElementSelector, webdriver: Driver | WebDriver = None,
interval: float = 1) -> None:
"""
Based on dynamic web page analyzer (row button page flipping)
Expand Down Expand Up @@ -183,7 +194,7 @@ def pre_load(self, webdriver: WebDriver) -> None:

class DynamicNumButtonPager(DynamicPager):
@check
def __init__(self, uri: str, button_selector: WebElementSelector, webdriver: Driver = None, start: int = 1,
def __init__(self, uri: str, button_selector: WebElementSelector, webdriver: Driver | WebDriver = None, start: int = 1,
offset: int = 1, interval: float = 1) -> None:
"""
Based on dynamic web page analyzer (digital button flipping)
Expand Down Expand Up @@ -262,7 +273,7 @@ def pre_load(self, webdriver: WebDriver) -> None:

class DynamicNextButtonPager(DynamicPager):
@check
def __init__(self, uri: str, button_selector: WebElementSelector, webdriver: Driver = None, start: int = 1,
def __init__(self, uri: str, button_selector: WebElementSelector, webdriver: Driver | WebDriver = None, start: int = 1,
offset: int = 1, interval: float = 1) -> None:
"""
Based on dynamic web page analyzer (click the next page button to page)
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
EMAIL = '[email protected]'
AUTHOR = 'WwyDev'
REQUIRES_PYTHON = '>=3.10.0'
VERSION = '0.0.6'
VERSION = '0.0.7'
# What packages are required for this module to be executed?
REQUIRED = [
'parsel', 'selenium>=4.0.0', 'cssselect', 'lxml', 'requests', 'webdriver-manager'
Expand Down
30 changes: 29 additions & 1 deletion test.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,34 @@
from selenium.webdriver.chrome.service import Service


def getDriver(is_headless=False):
    """Build a local Chrome WebDriver for use in the tests of this module.

    :param is_headless: when True, Chrome is started with ``--headless``
    :return: the created ``wd.Chrome`` instance
    """
    option = wd.ChromeOptions()
    arguments = [
        "no-sandbox",
        "--disable-extensions",
        '--disable-gpu',
        'User-Agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36"',
        "window-size=1920x3000",
        "start-maximized",
        # The comma after this entry matters: without it Python joins the two
        # adjacent string literals into a single, meaningless argument.
        'cache-control="max-age=0"',
        "disable-blink-features=AutomationControlled",
    ]
    for argument in arguments:
        option.add_argument(argument)
    if is_headless:
        option.add_argument("--headless")
    option.add_experimental_option('excludeSwitches', ['enable-automation'])
    webdriver = wd.Chrome(service=Service(ChromeDriverManager().install()), options=option)
    # Registered for every new document so that `navigator.webdriver`
    # reads as false inside the page.
    webdriver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
        "source": """
            Object.defineProperty(navigator, 'webdriver', {
                get: () => false
            })
        """
    })
    return webdriver


class TestCase(unittest.TestCase):
limit = 100

Expand Down Expand Up @@ -257,7 +285,7 @@ def pre_load(self, webdriver: WebDriver) -> None:

pager = MyPager(uri="https://www.baidu.com/",
button_selector=cl.XpathWebElementSelector('//*[@id="page"]/div/a/span'),
webdriver=cl.DefaultDriver(isDebug=True), interval=5)
webdriver=cl.DefaultDriver(is_debug=True), interval=5)
selector = cl.XpathSelector(pattern='/html/body/div[3]/div[3]/div[1]/div[3]/div')
analyzer = cl.AnalyzerPrettify(pager, selector)
res = []
Expand Down

0 comments on commit cea61c3

Please sign in to comment.