first commit
This commit is contained in:
commit
5ee90bca3d
|
@ -0,0 +1,3 @@
|
|||
venv
|
||||
.vscode
|
||||
__pycache__
|
|
@ -0,0 +1,14 @@
|
|||
import json
|
||||
import logging
|
||||
|
||||
from stolichki.parser import StolichkiParser
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: run the parser once and dump the result to data.json.
    logging.basicConfig(level=logging.INFO)

    result = StolichkiParser().run()

    # ensure_ascii=False writes raw Cyrillic characters, so the file encoding
    # is pinned to UTF-8 instead of relying on the platform default.
    with open("data.json", "w", encoding="utf-8") as f:
        json.dump(result, f, indent=4, ensure_ascii=False)
|
|
@ -0,0 +1,18 @@
|
|||
attrs==23.1.0
|
||||
certifi==2023.7.22
|
||||
charset-normalizer==3.2.0
|
||||
exceptiongroup==1.1.3
|
||||
h11==0.14.0
|
||||
idna==3.4
|
||||
outcome==1.2.0
|
||||
PySocks==1.7.1
|
||||
requests==2.31.0
|
||||
selenium==4.12.0
|
||||
selenium-stealth==1.0.6
|
||||
sniffio==1.3.0
|
||||
sortedcontainers==2.4.0
|
||||
trio==0.22.2
|
||||
trio-websocket==0.10.4
|
||||
urllib3==2.0.4
|
||||
wsproto==1.2.0
|
||||
2captcha-python==1.2.1
|
|
@ -0,0 +1,118 @@
|
|||
import time
|
||||
import logging
|
||||
import uuid
|
||||
import os
|
||||
|
||||
from selenium import webdriver
|
||||
from selenium.webdriver.chrome.options import Options
|
||||
from selenium.webdriver.chrome.service import Service
|
||||
from selenium.webdriver.support.ui import WebDriverWait
|
||||
from selenium.webdriver.support import expected_conditions as EC
|
||||
from selenium.webdriver.common.by import By
|
||||
from selenium.common.exceptions import (
|
||||
ElementNotVisibleException,
|
||||
NoSuchElementException,
|
||||
)
|
||||
|
||||
from selenium_stealth import stealth
|
||||
|
||||
from twocaptcha import TwoCaptcha
|
||||
|
||||
class CaptchaSolverError(Exception):
    """Raised when the captcha could not be solved.

    Accepts the same positional arguments as ``Exception``.
    """
|
||||
|
||||
class StolichkiDriver(webdriver.Chrome):
    """Chrome WebDriver that uses the site logo as the "page loaded" signal.

    ``get`` retries loading a page up to five times and tries to solve a
    captcha through the 2Captcha client between attempts. The API key is read
    from the ``TWOCAPTCHA_KEY`` environment variable; the historical
    misspelling ``TWOCAPTCA_KEY`` is still accepted as a fallback.
    """

    # XPath of the logo image whose presence means the page has loaded.
    _LOGO_XPATH = '//img[@alt="Логотип"]'
    # Environment variables checked, in order, for the 2Captcha API key.
    _API_KEY_VARS = ("TWOCAPTCHA_KEY", "TWOCAPTCA_KEY")
    # Directory where diagnostics for unreachable pages are stored.
    _ERRORS_DIR = "errors"

    def __init__(
        self, options: Options = None, service: Service = None, keep_alive: bool = True
    ) -> None:
        """Configure Chrome options, the captcha solver and stealth settings.

        Args:
            options: Chrome options; a fresh ``ChromeOptions`` is used if None.
            service: Chrome driver service passed through to ``webdriver.Chrome``.
            keep_alive: Passed through to ``webdriver.Chrome``.

        Raises:
            RuntimeError: If no 2Captcha API key is set in the environment.
        """
        api_key = next(
            (os.environ[name] for name in self._API_KEY_VARS if os.environ.get(name)),
            None,
        )
        # Explicit raise instead of ``assert``: asserts are stripped under -O.
        if api_key is None:
            raise RuntimeError("Can't find environment variable TWOCAPTCHA_KEY")

        if options is None:
            options = webdriver.ChromeOptions()

        os.makedirs(self._ERRORS_DIR, exist_ok=True)

        options.add_experimental_option("excludeSwitches", ["enable-automation"])
        options.add_experimental_option("useAutomationExtension", False)
        options.page_load_strategy = "eager"

        self.__solver = TwoCaptcha(api_key)

        super().__init__(options, service, keep_alive)

        stealth(
            self,
            languages=["en-US", "en"],
            vendor="Google Inc.",
            platform="Win32",
            webgl_vendor="Intel Inc.",
            renderer="Intel Iris OpenGL Engine",
            fix_hairline=True,
        )

    def get(self, url: str) -> None:
        """Load ``url`` and block until the site logo is present.

        Waits up to 60 seconds for the logo. If it does not appear, tries to
        solve a captcha, refreshes the page and waits again, up to five
        attempts in total.

        Raises:
            TimeoutError: If the logo never appeared after five attempts.
            CaptchaSolverError: If a captcha was shown but could not be solved.
        """
        super().get(url)
        logging.info(f"Loading {url}")
        for attempt in range(5):
            logging.debug(f"Attempt: {attempt + 1} for {url}")

            if self.__wait_for_presence(self._LOGO_XPATH):
                return

            # A solved captcha already left the logo on screen: nothing to reload.
            if self.__handle_captcha():
                return

            self.execute_script("window.stop();")
            time.sleep(1)
            self.refresh()

        # The page did not load: keep a screenshot and the page source.
        self.__save_error_artifacts(url)

        raise TimeoutError("Can't reach website. Check your connection or query.")

    def __save_error_artifacts(self, url: str) -> None:
        """Save a screenshot and the HTML source of the current page for ``url``."""
        error_id = str(uuid.uuid4())
        # URLs contain "/" and ":" which are not valid in a single file name.
        safe_url = "".join(
            ch if ch.isalnum() or ch in "-_." else "_" for ch in url
        )[:100]
        base_path = os.path.join(self._ERRORS_DIR, f"{safe_url}-{error_id}")

        logging.critical(f"Can't reach to {url}.")
        self.get_screenshot_as_file(f"{base_path}.png")
        with open(f"{base_path}.html", "w", encoding="utf-8") as f:
            f.write(self.page_source)

    def __wait_for_presence(self, xpath: str, delay: int = 60) -> bool:
        """Return True if an element matching ``xpath`` appears within ``delay`` seconds."""
        # Imported locally so the module-level import block stays untouched.
        from selenium.common.exceptions import TimeoutException

        try:
            WebDriverWait(self, delay).until(
                EC.presence_of_element_located((By.XPATH, xpath))
            )
        except (TimeoutException, NoSuchElementException, ElementNotVisibleException):
            # WebDriverWait.until signals expiry with TimeoutException.
            return False

        logging.info("Loading element was founded")
        return True

    def __handle_captcha(self) -> bool:
        """Try to solve the captcha shown on the current page.

        Returns:
            True if the logo appeared after submitting a solution, False if
            no captcha image is present on the page.

        Raises:
            CaptchaSolverError: If five submitted solutions all failed.
        """
        for attempt in range(5):
            logging.info(f"Trying to solve captcha {attempt + 1}/5")
            try:
                captcha_image = self.find_element(By.ID, "captcha_image")
            except NoSuchElementException:
                logging.info("Can't find captcha image")
                return False

            captcha_base64 = captcha_image.screenshot_as_base64
            solution = self.__solver.normal(captcha_base64)
            # The 2Captcha client returns a dict; the recognized text is under "code".
            captcha_text = solution["code"] if isinstance(solution, dict) else solution

            self.find_element(By.ID, "captcha_input").send_keys(captcha_text)
            self.find_element(By.ID, "submit_button").click()

            if self.__wait_for_presence(self._LOGO_XPATH, 60):
                return True

        raise CaptchaSolverError()
|
|
@ -0,0 +1,109 @@
|
|||
import logging
|
||||
from multiprocessing import Pool
|
||||
|
||||
from selenium import webdriver
|
||||
from selenium.webdriver.remote.webelement import WebElement
|
||||
from selenium.webdriver.common.by import By
|
||||
|
||||
from .browser import StolichkiDriver
|
||||
from .product import Product
|
||||
|
||||
|
||||
class StolichkiParser:
    """Collects products from every category of the stolichki.ru catalog."""

    # Default city reported in the result.
    city = {
        "id": 1,
        "name": "Москва",
    }
    # Default chromedriver location; override through the constructor.
    chromedriver_path = "/home/winet/.local/bin/chromedriver"
    # How many consecutive load failures of one page are tolerated.
    max_page_retries = 3

    def __init__(
        self, city: dict | None = None, chromedriver_path: str | None = None
    ) -> None:
        """Create the browser driver.

        Args:
            city: Dict with "id" and "name" keys; the class default is used if None.
            chromedriver_path: Path to the chromedriver binary; the class
                default is used if None.
        """
        if city is not None:
            self.city = city
        if chromedriver_path is not None:
            self.chromedriver_path = chromedriver_path

        service = webdriver.ChromeService(self.chromedriver_path)
        self.driver = StolichkiDriver(service=service)
        logging.info(f"Parser initialize complete! City: {self.city.get('name')}")

    def run(self):
        """Parse the whole catalog.

        Returns:
            Dict with the "city" (id and name) and the list of parsed "items".
        """
        logging.info(f"Parser started. City: {self.city.get('name')}")
        self.driver.get("https://stolichki.ru/catalog")

        categories_lists = self.driver.find_elements(
            By.CLASS_NAME, "categoryList__item"
        )

        # Collect plain strings before navigating away from the catalog page.
        links = []
        for category_list in categories_lists:
            link_tags = category_list.find_elements(
                By.CLASS_NAME, "catalogPreview__caption"
            )
            for link_tag in link_tags:
                # Read the attribute once: each call is a round trip to the browser.
                href = link_tag.get_attribute("href")
                if href is not None:
                    links.append(href)

        logging.info(f"Finished parsing categories: Links: {links}")
        items = []
        for link in links:
            items.extend(self.__get_items(link))

        return {
            "city": {
                "id": self.city.get("id"),
                "name": self.city.get("name"),
            },
            "items": items,
        }

    def __get_items(self, url: str):
        """Walk the pages of one category and return all parsed products."""
        items_list = []
        page = 1
        failures = 0
        while True:
            try:
                self.driver.get(f"{url}?page={page}")
            except TimeoutError:
                # Retry the same page, but not forever.
                failures += 1
                if failures >= self.max_page_retries:
                    logging.error(
                        f"Giving up on {url}?page={page} after {failures} failed loads"
                    )
                    break
                continue
            failures = 0

            # find_elements returns an empty list instead of raising when the
            # catalog container is absent; that is treated as the end of pages.
            catalog_lists = self.driver.find_elements(By.ID, "catalog-list")
            if not catalog_lists:
                break

            product_items = catalog_lists[0].find_elements(
                By.CLASS_NAME, "product-item"
            )
            if not product_items:
                break

            items_list.extend(self.__parse_list(product_items))
            page += 1

        return items_list

    def __parse_list(self, product_items: list[WebElement]):
        """Parse every product linked from the given product list elements."""
        # Gather the links first: opening a product page invalidates the elements.
        product_links: list[str] = []
        for product_item in product_items:
            title_links = product_item.find_elements(
                By.XPATH, './/p[contains(@class,"product-title")]/a'
            )
            if not title_links:
                continue
            href = title_links[0].get_attribute("href")
            if href is not None:
                product_links.append(href)

        logging.info(f"Links in product list parsed. Links: {product_links}")

        data = []
        for product_link in product_links:
            try:
                product = Product(self.driver, product_link).get_dict()
            except Exception:
                # Best effort: one broken product must not abort the run, but
                # the failure is logged instead of being silently dropped.
                logging.exception(f"Failed to parse product {product_link}")
                continue

            logging.info(f"{product} was parsed.")
            data.append(product)

        return data

    def set_city(self, id: int):
        # TODO: implement city switching by replacing the value in the browser cookie
        pass
|
|
@ -0,0 +1,81 @@
|
|||
import logging
|
||||
|
||||
from selenium.webdriver.common.by import By
|
||||
from selenium.webdriver.support.ui import WebDriverWait
|
||||
from selenium.webdriver.support import expected_conditions as EC
|
||||
|
||||
from .browser import StolichkiDriver
|
||||
|
||||
|
||||
class Product:
    """A single product page: its id, name, availability and stock per store."""

    id: int = 0
    name: str = ""
    available: bool = False
    stores: list

    # Prefix of product URLs; the remainder is the numeric product id.
    _URL_PREFIX = "https://stolichki.ru/drugs/"

    def __init__(self, driver: "StolichkiDriver", url: str) -> None:
        """Open ``url`` in ``driver`` and parse the product page.

        Args:
            driver: Browser driver used to load and inspect the page.
            url: Product page URL.
        """
        self.driver = driver
        self.url = url
        # Per-instance list: a class-level list would be shared by all products.
        self.stores = []

        self.driver.get(self.url)
        self.__parse_page()

    def get_dict(self):
        """Return the product as a dict with id, name, available and stores."""
        return {
            "id": self.id,
            "name": self.name,
            "available": self.available,
            "stores": self.stores,
        }

    @staticmethod
    def _parse_id(url: str) -> int:
        """Extract the numeric product id from a product URL.

        A query string, fragment or trailing slash is ignored.

        Raises:
            ValueError: If the remainder after the URL prefix is not an integer.
        """
        path = url.split("#", 1)[0].split("?", 1)[0].rstrip("/")
        return int(path.removeprefix(Product._URL_PREFIX))

    def __parse_page(self):
        """Fill name, id, availability and stores from the loaded page."""
        # Imported locally so the module-level import block stays untouched.
        from selenium.common.exceptions import WebDriverException

        self.name = (
            self.driver.find_element(By.XPATH, '//h1[@itemprop="name"]')
            .text.removeprefix("Купить")
            .strip()
        )

        self.id = self._parse_id(self.url)

        try:
            stores = self.__parse_stores()
        except WebDriverException:
            # Store info is optional; keep the product but record the failure.
            logging.critical("Can't get info about stores", exc_info=True)
            return

        if stores:
            self.available = True
            self.stores = stores

    def __parse_stores(self):
        """Return a list of {"name", "quantity"} dicts, one per store row."""
        from selenium.common.exceptions import WebDriverException

        # If either element is present the product has no store list to open.
        # find_elements returns [] rather than raising when nothing matches.
        no_stock_selectors = (
            "p.badge-class.product-not-found",
            "a.stores-stock.stores-order.package",
        )
        for selector in no_stock_selectors:
            if self.driver.find_elements(By.CSS_SELECTOR, selector):
                return []

        self.driver.find_element(By.CLASS_NAME, "stores-stock").click()

        WebDriverWait(self.driver, 30).until(
            EC.presence_of_element_located((By.CLASS_NAME, "tr-start-store"))
        )

        stores_list = []
        for store in self.driver.find_elements(By.CLASS_NAME, "tr-start-store"):
            try:
                store_name = store.find_element(By.CLASS_NAME, "store-link").text
                quantity = int(store.find_element(By.CLASS_NAME, "part-quantity").text)
            except (WebDriverException, ValueError):
                # Skip rows without a store link or a numeric quantity.
                continue

            stores_list.append({"name": store_name, "quantity": quantity})

        return stores_list
|
Loading…
Reference in New Issue