From fe16d4eee9953d8ab99a268b7eed48b62c8838ce Mon Sep 17 00:00:00 2001
From: Anatoly Bogomolov
Date: Fri, 5 Jan 2024 15:52:44 +1000
Subject: [PATCH] Testing

---
 crawler/api.py            | 12 +++++------
 crawler/types.py          |  4 +++-
 main.py                   | 43 ++++++++++++++++++++++++++++++++++-----
 stolichki/driver.py       |  3 ++-
 stolichki/errors.py       |  4 ----
 stolichki/parsers/city.py |  4 +++-
 6 files changed, 52 insertions(+), 18 deletions(-)

diff --git a/crawler/api.py b/crawler/api.py
index 084a6da..bf038ab 100644
--- a/crawler/api.py
+++ b/crawler/api.py
@@ -1,16 +1,16 @@
 import asyncio
 import configparser
 import random
-import logging
+from loguru import logger
 
 import aiohttp
 import backoff
 
-from utils.exceptions import ConfigError
+from crawler.utils.exceptions import ConfigError
 from crawler.types import City, Proxy
-from utils.classes import Singleton
+from crawler.utils.classes import Singleton
 
-log = logging.getLogger(__name__)
+log = logger
 
 class CrawlerAPI(metaclass=Singleton):
     api_baseurl = "https://q.asburo.ru/ch/"
@@ -94,7 +94,7 @@ class CrawlerAPI(metaclass=Singleton):
 
     @backoff.on_exception(backoff.expo, (aiohttp.ClientError, aiohttp.ServerConnectionError), max_tries=15, logger=log)
     async def send_products(self, results: list):
-        log.info("Sending data")
+        log.info("Sending data...")
 
         url = f"{self.api_url}/prices/{self.rival_tag}"
 
@@ -105,7 +105,7 @@ class CrawlerAPI(metaclass=Singleton):
             response = await self.session.post(url, json=data, auth=self.auth)
             status, response_text = response.status, await response.text()
 
-            log.info(f"{data} was sended. Status: {status}. Response: {response_text}")
+            log.debug(f"{data} was sent. Status: {status}. Response: {response_text}")
 
             if status >= 500:
                 await asyncio.sleep(15)
diff --git a/crawler/types.py b/crawler/types.py
index 1622bd0..f6d745d 100644
--- a/crawler/types.py
+++ b/crawler/types.py
@@ -1,11 +1,13 @@
 from dataclasses import dataclass
-from utils.classes import DataclassBase
+from crawler.utils.classes import DataclassBase
 
 @dataclass(init=False)
 class City(DataclassBase):
+    id: int = 0
     city: str
     region_id: int | None
     region_name: str | None
+    is_byapt: int  # 0 or 1
 
 @dataclass(init=False)
 class Proxy(DataclassBase):
diff --git a/main.py b/main.py
index 4bc9361..c941467 100644
--- a/main.py
+++ b/main.py
@@ -1,17 +1,50 @@
 import sys
+import asyncio
+import configparser
+import json
+import multiprocessing
 
 from loguru import logger
 
 from stolichki.parsers.city import CityParser
 from stolichki.types.city import City
 
+from crawler.api import CrawlerAPI
+from crawler.utils.classes import DataclassJSONEncoder
+from crawler.types import City, Proxy
+
+async def get_crawler_data():
+    crawler = CrawlerAPI()
+    cities = await crawler.get_cities()
+    proxies = await crawler.get_proxies()
+    return cities, proxies
+
+def filter_cities(config, cities: list[City]) -> list[City]:
+
+    with open(config["parser"]["cities_path"]) as f:
+        cities_stolichki = json.load(f)
+
+    for city in cities:
+        city.id = cities_stolichki[city.city]
+
+    return cities
 
 @logger.catch
-def main():
-    city = City(111, "Бутово", 1, [])
-    result = CityParser(city).parse()
-    print(result)
+def main(cities: list[City], proxies: list[Proxy]):
+    config = configparser.ConfigParser()
+    config.read("config.ini")
+
+    cities = filter_cities(config, cities)
+    quantity = int(config["parser"]["cities_quantity"] or len(cities))
+
+    with multiprocessing.Pool(processes=quantity) as pool:
+        results = pool.map(lambda city: CityParser(city, proxies).parse(), cities)
+
+    with open("results.json", "w") as f:
+        json.dump(results, f, cls=DataclassJSONEncoder, ensure_ascii=False, indent=4)
 
 if __name__ == "__main__":
     logger.add(sys.stderr, level="DEBUG", backtrace=True, enqueue=True) #type: ignore
-    main()
\ No newline at end of file
+
+    cities, proxies = asyncio.run(get_crawler_data())
+    main(cities, proxies)
\ No newline at end of file
diff --git a/stolichki/driver.py b/stolichki/driver.py
index 1f1b423..f19aa38 100644
--- a/stolichki/driver.py
+++ b/stolichki/driver.py
@@ -19,7 +19,8 @@ from webdriver_manager.chrome import ChromeDriverManager
 
 from twocaptcha import TwoCaptcha
 
-from stolichki.errors import CaptchaError, ConfigError, LoadingError
+from stolichki.errors import CaptchaError, LoadingError
+from crawler.utils.exceptions import ConfigError
 
 class StolichkiDriver(uc.Chrome):
     def __init__(self, **kwargs):
diff --git a/stolichki/errors.py b/stolichki/errors.py
index d31bb0e..da8c1c6 100644
--- a/stolichki/errors.py
+++ b/stolichki/errors.py
@@ -3,9 +3,5 @@ class CaptchaError(Exception):
         super().__init__(*args)
 
 class LoadingError(Exception):
-    def __init__(self, *args: object) -> None:
-        super().__init__(*args)
-
-class ConfigError(Exception):
     def __init__(self, *args: object) -> None:
         super().__init__(*args)
\ No newline at end of file
diff --git a/stolichki/parsers/city.py b/stolichki/parsers/city.py
index 150592e..a9eb3d7 100644
--- a/stolichki/parsers/city.py
+++ b/stolichki/parsers/city.py
@@ -1,12 +1,14 @@
 from selenium.webdriver.common.by import By
 
+from crawler.types import Proxy
+
 from stolichki.driver import StolichkiDriver
 from stolichki.parsers.category import get_category_parser
 from stolichki.types.city import City
 
 
 class CityParser:
-    def __init__(self, city: City) -> None:
+    def __init__(self, city: City, proxies: list[Proxy]) -> None:
         self.driver = StolichkiDriver()
         self.city = city
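
Note: filter_cities in main.py looks up every crawler city in the cities_path mapping, so a single city missing from that file raises KeyError, and despite its name the function never drops unmapped entries. A more defensive variant (a sketch; key names and the id-assignment semantics are taken from the diff, the filtering behavior is an assumption based on the function's name):

    import json

    from crawler.types import City

    def filter_cities(config, cities: list[City]) -> list[City]:
        with open(config["parser"]["cities_path"]) as f:
            cities_stolichki = json.load(f)

        # Keep only cities with a known Stolichki id instead of raising
        # KeyError on the first unmapped city name.
        filtered = []
        for city in cities:
            if city.city in cities_stolichki:
                city.id = cities_stolichki[city.city]
                filtered.append(city)
        return filtered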
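
Note: multiprocessing.Pool.map pickles the callable it dispatches to worker processes, and lambdas cannot be pickled, so the pool.map(lambda city: ...) line in main.py raises a pickling error at runtime. A minimal picklable alternative, sketched on the assumption that CityParser(city, proxies).parse() is the intended per-city entry point:

    import functools
    import multiprocessing

    from crawler.types import City, Proxy
    from stolichki.parsers.city import CityParser

    def parse_city(proxies: list[Proxy], city: City):
        # Module-level functions are picklable; proxies is bound first with
        # functools.partial so Pool.map can supply each city as the
        # remaining positional argument.
        return CityParser(city, proxies).parse()

    with multiprocessing.Pool(processes=quantity) as pool:
        results = pool.map(functools.partial(parse_city, proxies), cities)

Related: main.py imports City from stolichki.types.city and then from crawler.types; the second import shadows the first, so every City annotation in main.py refers to the crawler dataclass even though CityParser expects the stolichki type.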
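
Note: DataclassJSONEncoder is imported from crawler.utils.classes but its definition is not part of the patch. For context, such an encoder is typically a json.JSONEncoder subclass along these lines (a sketch; the actual class may differ):

    import dataclasses
    import json

    class DataclassJSONEncoder(json.JSONEncoder):
        def default(self, o):
            # Serialize dataclass instances as plain dicts; defer anything
            # else to the base implementation, which raises TypeError for
            # unsupported types.
            if dataclasses.is_dataclass(o) and not isinstance(o, type):
                return dataclasses.asdict(o)
            return super().default(o)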