first commit

2023-12-12 16:13:58 +10:00 · 2023-12-12 16:13:58 +10:00 · 5ee90bca3d
commit 5ee90bca3d
7 changed files with 343 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@ -0,0 +1,3 @@
 venv
 .vscode
 __pycache__
--- a/main.py
+++ b/main.py
@ -0,0 +1,14 @@
 import json
 import logging
 from stolichki.parser import StolichkiParser
 if __name__ == "__main__":
    # Script entry point: scrape the catalogue and dump the result to data.json.
    logging.basicConfig(level=logging.INFO)
    parser = StolichkiParser()
    try:
        result = parser.run()
    finally:
        # Always shut the browser down, even when scraping fails; otherwise
        # the Chrome/chromedriver processes outlive the script.
        parser.driver.quit()
    # ensure_ascii=False writes raw non-ASCII characters (e.g. Cyrillic names),
    # so the encoding must be explicit instead of the platform default.
    with open("data.json", "w", encoding="utf-8") as f:
        json.dump(result, f, indent=4, ensure_ascii=False)
--- a/requirements.txt
+++ b/requirements.txt
@ -0,0 +1,18 @@
 attrs==23.1.0
 certifi==2023.7.22
 charset-normalizer==3.2.0
 exceptiongroup==1.1.3
 h11==0.14.0
 idna==3.4
 outcome==1.2.0
 PySocks==1.7.1
 requests==2.31.0
 selenium==4.12.0
 selenium-stealth==1.0.6
 sniffio==1.3.0
 sortedcontainers==2.4.0
 trio==0.22.2
 trio-websocket==0.10.4
 urllib3==2.0.4
 wsproto==1.2.0
 2captcha-python==1.2.1
--- a/stolichki/init.py
+++ b/stolichki/init.py
--- a/stolichki/browser.py
+++ b/stolichki/browser.py
@ -0,0 +1,118 @@
 import time
 import logging
 import uuid
 import os
 from selenium import webdriver
 from selenium.webdriver.chrome.options import Options
 from selenium.webdriver.chrome.service import Service
 from selenium.webdriver.support.ui import WebDriverWait
 from selenium.webdriver.support import expected_conditions as EC
 from selenium.webdriver.common.by import By
 from selenium.common.exceptions import (
    ElementNotVisibleException,
    NoSuchElementException,
 )
 from selenium_stealth import stealth
 from twocaptcha import TwoCaptcha
 class CaptchaSolverError(Exception):
    """Error raised when a captcha could not be solved.

    Accepts the same positional arguments as ``Exception`` and forwards
    them unchanged, so ``args`` and ``str()`` behave like the base class.
    """

    def __init__(self, *args: object) -> None:
        # Plain pass-through constructor; no extra state is stored.
        super(CaptchaSolverError, self).__init__(*args)
 class StolichkiDriver(webdriver.Chrome):
    """Chrome driver that waits for the site logo and solves captchas.

    ``get`` is overridden so that a page only counts as loaded once the
    logo image is present. When it is not, the driver tries to solve a
    captcha through the 2captcha service and retries.

    The 2captcha API key is read from the ``TWOCAPTCHA_KEY`` environment
    variable. The historical misspelling ``TWOCAPTCA_KEY`` is still accepted
    so existing setups keep working.
    """

    # XPath of the logo image; its presence is used as the "page loaded" signal.
    _LOGO_XPATH = '//img[@alt="Логотип"]'
    # Directory where a screenshot and page source are dumped on failure.
    _ERRORS_DIR = "errors"
    # Environment variables checked for the API key, in priority order.
    _API_KEY_VARS = ("TWOCAPTCHA_KEY", "TWOCAPTCA_KEY")

    def __init__(
        self, options: Options = None, service: Service = None, keep_alive: bool = True
    ) -> None:
        """Create the driver, configure Chrome and apply selenium-stealth.

        Args:
            options: Chrome options; a fresh ``ChromeOptions`` is used when None.
            service: chromedriver service passed to ``webdriver.Chrome``.
            keep_alive: forwarded to ``webdriver.Chrome``.

        Raises:
            RuntimeError: if no 2captcha API key is set in the environment.
        """
        api_key = next(
            (os.environ[name] for name in self._API_KEY_VARS if os.environ.get(name)),
            None,
        )
        if api_key is None:
            # An explicit raise (not ``assert``) so the check survives ``python -O``.
            raise RuntimeError("Can't find environment variable TWOCAPTCHA_KEY")
        if options is None:
            options = webdriver.ChromeOptions()
        os.makedirs(self._ERRORS_DIR, exist_ok=True)
        options.add_experimental_option("excludeSwitches", ["enable-automation"])
        options.add_experimental_option("useAutomationExtension", False)
        options.page_load_strategy = "eager"
        self.__solver = TwoCaptcha(api_key)
        super().__init__(options, service, keep_alive)
        stealth(
            self,
            languages=["en-US", "en"],
            vendor="Google Inc.",
            platform="Win32",
            webgl_vendor="Intel Inc.",
            renderer="Intel Iris OpenGL Engine",
            fix_hairline=True,
        )

    def get(self, url: str) -> None:
        """Load ``url`` and block until the site logo is present.

        Waits up to 60 seconds for the logo. If it does not appear, tries to
        solve a captcha, then stops loading, refreshes and waits again, for
        five attempts in total.

        Raises:
            TimeoutError: if the logo never appeared; a screenshot and the
                page source are saved to the errors directory first.
            CaptchaSolverError: if a captcha was shown but could not be solved.
        """
        logging.info(f"Loading {url}")
        super().get(url)
        for attempt in range(5):
            logging.debug(f"Attempt: {attempt + 1} for {url}")
            if self.__wait_for_presence(self._LOGO_XPATH):
                return
            if self.__handle_captcha():
                # Captcha solved and the logo is now present: page is ready.
                return
            self.execute_script("window.stop();")
            time.sleep(1)
            self.refresh()
        self.__dump_failure(url)
        raise TimeoutError("Can't reach website. Check your connection or query.")

    def __dump_failure(self, url: str) -> None:
        """Save a screenshot and the page source of the page that failed to load."""
        dump_id = uuid.uuid4()
        # A URL contains "/" and ":" which are not usable in a file name, so
        # every character outside a conservative whitelist is replaced; the
        # result is truncated to keep the file name reasonably short.
        safe_name = "".join(
            ch if ch.isalnum() or ch in "-_." else "_" for ch in url
        )[:100]
        base_path = os.path.join(self._ERRORS_DIR, f"{safe_name}-{dump_id}")
        logging.critical(f"Can't reach to {url}.")
        self.get_screenshot_as_file(f"{base_path}.png")
        with open(f"{base_path}.html", "w", encoding="utf-8") as f:
            f.write(self.page_source)

    def __wait_for_presence(self, xpath: str, delay: int = 60) -> bool:
        """Return True if an element matching ``xpath`` appears within ``delay`` seconds."""
        # Imported locally: the module-level import list does not include it.
        from selenium.common.exceptions import TimeoutException

        try:
            WebDriverWait(self, delay).until(
                EC.presence_of_element_located((By.XPATH, xpath))
            )
        except (TimeoutException, NoSuchElementException, ElementNotVisibleException):
            # WebDriverWait.until signals expiry with TimeoutException.
            return False
        logging.info("Loading element was founded")
        return True

    def __handle_captcha(self) -> bool:
        """Try to solve the captcha shown on the current page.

        Returns:
            True if a captcha was submitted and the logo appeared afterwards,
            False if the page has no captcha image.

        Raises:
            CaptchaSolverError: if five submissions in a row did not lead to
                the logo appearing.
        """
        for attempt in range(5):
            logging.info(f"Trying to solve captcha {attempt + 1}/5")
            try:
                captcha_image = self.find_element(By.ID, "captcha_image")
            except NoSuchElementException:
                logging.info("Can't find captcha image")
                return False
            solution = self.__solver.normal(captcha_image.screenshot_as_base64)
            # 2captcha-python is documented to return a dict with the answer
            # under "code"; a plain string is accepted as well to stay safe.
            captcha_text = solution["code"] if isinstance(solution, dict) else solution
            self.find_element(By.ID, "captcha_input").send_keys(captcha_text)
            self.find_element(By.ID, "submit_button").click()
            if self.__wait_for_presence(self._LOGO_XPATH, 60):
                return True
        raise CaptchaSolverError()
--- a/stolichki/parser.py
+++ b/stolichki/parser.py
@ -0,0 +1,109 @@
 import logging
 from multiprocessing import Pool
 from selenium import webdriver
 from selenium.webdriver.remote.webelement import WebElement
 from selenium.webdriver.common.by import By
 from .browser import StolichkiDriver
 from .product import Product
 class StolichkiParser:
    """Scraper that walks the stolichki.ru catalogue and collects products.

    ``run`` visits the catalogue root, follows every category link, pages
    through each category and parses every product found.
    """

    # Default city reported in the result; overridden per instance via __init__.
    city = {
        "id": 1,
        "name": "Москва",
    }
    # chromedriver location used when the caller does not supply one.
    DEFAULT_CHROMEDRIVER_PATH = "/home/winet/.local/bin/chromedriver"
    # Consecutive load failures tolerated for one catalogue page before
    # the category is abandoned.
    MAX_PAGE_RETRIES = 3

    def __init__(
        self, city: dict | None = None, chromedriver_path: str | None = None
    ) -> None:
        """Start the browser.

        Args:
            city: dict with ``id`` and ``name`` keys; the class default is
                used when None.
            chromedriver_path: path to the chromedriver binary; falls back to
                ``DEFAULT_CHROMEDRIVER_PATH`` when None.
        """
        if city is not None:
            self.city = city
        if chromedriver_path is None:
            chromedriver_path = self.DEFAULT_CHROMEDRIVER_PATH
        service = webdriver.ChromeService(chromedriver_path)
        self.driver = StolichkiDriver(service=service)
        logging.info(f"Parser initialize complete! City: {self.city.get('name')}")

    def run(self):
        """Scrape the whole catalogue.

        Returns:
            A dict with the ``city`` (id and name) and the list of parsed
            product dicts under ``items``.
        """
        logging.info(f"Parser started. City: {self.city.get('name')}")
        self.driver.get("https://stolichki.ru/catalog")
        links = []
        for category_list in self.driver.find_elements(
            By.CLASS_NAME, "categoryList__item"
        ):
            for link_tag in category_list.find_elements(
                By.CLASS_NAME, "catalogPreview__caption"
            ):
                # Read the attribute once; elements without an href are skipped.
                if (href := link_tag.get_attribute("href")) is not None:
                    links.append(href)
        logging.info(f"Finished parsing categories: Links: {links}")
        items = []
        for link in links:
            items.extend(self.__get_items(link))
        return {
            "city": {
                "id": self.city.get("id"),
                "name": self.city.get("name"),
            },
            "items": items,
        }

    def __get_items(self, url: str):
        """Collect the products of one category by paging until a page is empty."""
        items_list = []
        page = 1
        failures = 0
        while True:
            try:
                self.driver.get(f"{url}?page={page}")
            except TimeoutError:
                # Retry the same page, but only a bounded number of times so a
                # permanently unreachable page cannot hang the run.
                failures += 1
                if failures >= self.MAX_PAGE_RETRIES:
                    logging.error(
                        f"Giving up on {url}?page={page} after {failures} timeouts"
                    )
                    break
                continue
            failures = 0
            # find_elements returns an empty list instead of raising, so a
            # page without the catalogue container simply ends the category.
            catalog_lists = self.driver.find_elements(By.ID, "catalog-list")
            if not catalog_lists:
                break
            product_items = catalog_lists[0].find_elements(
                By.CLASS_NAME, "product-item"
            )
            if not product_items:
                break
            items_list.extend(self.__parse_list(product_items))
            page += 1
        return items_list

    def __parse_list(self, product_items: list[WebElement]):
        """Parse every product linked from the given listing elements.

        The links are collected first because visiting a product navigates
        the shared driver away from the listing page.
        """
        product_links: list[str] = []
        for product_item in product_items:
            anchors = product_item.find_elements(
                By.XPATH, './/p[contains(@class,"product-title")]/a'
            )
            if anchors and (href := anchors[0].get_attribute("href")):
                product_links.append(href)
        logging.info(f"Links in product list parsed. Links: {product_links}")
        data = []
        for product_link in product_links:
            try:
                product = Product(self.driver, product_link).get_dict()
            except Exception:
                # Best effort: one broken product must not stop the run, but
                # the reason is logged and KeyboardInterrupt is not swallowed.
                logging.exception(f"Failed to parse product {product_link}")
                continue
            logging.info(f"{product} was parsed.")
            data.append(product)
        return data

    def set_city(self, id: int):
        """Switch the city (not implemented yet)."""
        # TODO: implement city switching by replacing the value in the browser cookies
        pass
--- a/stolichki/product.py
+++ b/stolichki/product.py
@ -0,0 +1,81 @@
 import logging
 from selenium.webdriver.common.by import By
 from selenium.webdriver.support.ui import WebDriverWait
 from selenium.webdriver.support import expected_conditions as EC
 from .browser import StolichkiDriver
 class Product:
    """A single product page, parsed on construction.

    Creating an instance loads ``url`` in the given driver and fills in
    ``id``, ``name``, ``available`` and ``stores``.
    """

    id: int = 0
    name: str = ""
    available: bool = False
    # Set per instance in __init__; a class-level list would be shared.
    stores: list

    def __init__(self, driver: "StolichkiDriver", url: str) -> None:
        """Load the product page and parse it.

        Args:
            driver: browser used to load and inspect the page.
            url: product URL of the form ``https://stolichki.ru/drugs/<id>``.

        Raises:
            ValueError: if no integer id can be extracted from ``url``.
        """
        self.driver = driver
        self.url = url
        self.stores = []
        self.driver.get(self.url)
        self.__parse_page()

    def get_dict(self):
        """Return the parsed product as a plain dict."""
        return {
            "id": self.id,
            "name": self.name,
            "available": self.available,
            "stores": self.stores,
        }

    @staticmethod
    def _extract_id(url: str) -> int:
        """Extract the numeric product id from a product URL.

        A query string, fragment or trailing slash after the id is ignored.

        Raises:
            ValueError: if what remains is not an integer.
        """
        tail = url.removeprefix("https://stolichki.ru/drugs/")
        for separator in ("?", "#"):
            tail = tail.partition(separator)[0]
        return int(tail.strip("/"))

    def __parse_page(self):
        """Fill in name, id and store availability from the loaded page."""
        heading = self.driver.find_element(By.XPATH, '//h1[@itemprop="name"]').text
        self.name = heading.removeprefix("Купить").strip()
        self.id = self._extract_id(self.url)
        try:
            stores = self.__parse_stores()
        except Exception:
            # Store info is best effort; keep the product but record why
            # the lookup failed.
            logging.critical("Can't get info about stores", exc_info=True)
            return
        if stores:
            self.available = True
            self.stores = stores

    def __parse_stores(self):
        """Return ``[{"name": ..., "quantity": ...}, ...]`` for the product.

        Returns an empty list when one of the marker elements checked below
        is present on the page.
        """
        # Imported locally: the module-level import list does not include it.
        from selenium.common.exceptions import WebDriverException

        # When either marker element exists, no store list is parsed.
        no_stock_markers = (
            "p.badge-class.product-not-found",
            "a.stores-stock.stores-order.package",
        )
        for selector in no_stock_markers:
            if self.driver.find_elements(By.CSS_SELECTOR, selector):
                return []
        self.driver.find_element(By.CLASS_NAME, "stores-stock").click()
        WebDriverWait(self.driver, 30).until(
            EC.presence_of_element_located((By.CLASS_NAME, "tr-start-store"))
        )
        stores_list = []
        for store in self.driver.find_elements(By.CLASS_NAME, "tr-start-store"):
            try:
                store_name = store.find_element(By.CLASS_NAME, "store-link").text
                quantity = int(
                    store.find_element(By.CLASS_NAME, "part-quantity").text
                )
            except (WebDriverException, ValueError):
                # Skip rows missing a name or a numeric quantity.
                continue
            stores_list.append({"name": store_name, "quantity": quantity})
        return stores_list