diff --git a/crawler/__init__.py b/crawler/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/crawler/api.py b/crawler/api.py
new file mode 100644
index 0000000..084a6da
--- /dev/null
+++ b/crawler/api.py
@@ -0,0 +1,125 @@
+import asyncio
+import configparser
+import logging
+import random
+
+import aiohttp
+import backoff
+
+from crawler.types import City, Proxy
+from crawler.utils.classes import Singleton
+from crawler.utils.exceptions import ConfigError
+
+log = logging.getLogger(__name__)
+
+
+class CrawlerAPI(metaclass=Singleton):
+    api_baseurl = "https://q.asburo.ru/ch/"
+    api_version = "v1"
+
+    def __init__(self) -> None:
+        log.info("Initializing crawler API class")
+
+        self.api_url = self.api_baseurl + self.api_version
+
+        # Per-instance caches, filled lazily on first use.
+        self.cities: list[City] = []
+        self.proxies: list[Proxy] = []
+
+        config = configparser.ConfigParser()
+
+        try:
+            config.read("config.ini")
+            log.debug("Reading config file")
+
+            self.config = config["crawler"]
+
+            self.rival_tag = self.config["tag"]
+            self.auth = aiohttp.BasicAuth(
+                self.config["username"], self.config["password"]
+            )
+
+            log.info("Successfully parsed config file.")
+            log.debug(f"Your auth: {self.auth}")
+
+        except (KeyError, configparser.Error) as e:
+            raise ConfigError(
+                "Can't read settings for crawler api. Check your config.ini."
+            ) from e
+
+        # ClientTimeout's first argument is the total timeout in seconds.
+        timeout = aiohttp.ClientTimeout(total=12000)
+        self.session = aiohttp.ClientSession(timeout=timeout)
+
+        log.info("Crawler API initialized")
+
+    @backoff.on_exception(backoff.expo, (aiohttp.ClientError, aiohttp.ServerConnectionError), max_time=60)
+    async def get_cities(self) -> list[City]:
+        # The city list is fetched once and cached for the singleton's lifetime.
+        if not self.cities:
+            url = f"{self.api_url}/cities/{self.rival_tag}"
+
+            async with self.session.get(url, auth=self.auth) as response:
+                if response.status >= 500:
+                    raise aiohttp.ServerConnectionError()
+
+                log.debug(f"Response status: {response.status} for {url}")
+
+                if response.status == 200:
+                    json_response = await response.json()
+                    self.cities = [City(**city) for city in json_response.get("city_list", [])]
+
+        return self.cities
+
+    @backoff.on_exception(backoff.expo, aiohttp.ClientError, max_time=60)
+    async def get_proxies(self) -> list[Proxy]:
+        # The proxy list is fetched once and cached for the singleton's lifetime.
+        if not self.proxies:
+            url = f"{self.api_url}/proxies/"
+
+            async with self.session.get(url, auth=self.auth) as response:
+                log.debug(f"Response status: {response.status} for {url}")
+
+                if response.status == 200:
+                    json_response = await response.json()
+                    self.proxies = [Proxy(**proxy) for proxy in json_response.get("proxy_list", [])]
+
+        return self.proxies
+
+    async def get_random_proxy(self) -> Proxy:
+        proxies = await self.get_proxies()
+
+        return random.choice(proxies)
+
+    async def remove_proxy(self, proxy: Proxy) -> Proxy:
+        # Drop a dead proxy from the pool and hand back a replacement.
+        self.proxies.remove(proxy)
+
+        return await self.get_random_proxy()
+
+    @backoff.on_exception(backoff.expo, (aiohttp.ClientError, aiohttp.ServerConnectionError), max_tries=15, logger=log)
+    async def send_products(self, results: list):
+        log.info("Sending data")
+
+        url = f"{self.api_url}/prices/{self.rival_tag}"
+        data = {"rows": results}
+
+        async with self.session.post(url, json=data, auth=self.auth) as response:
+            status, response_text = response.status, await response.text()
+
+        log.info(f"{data} was sent. Status: {status}. Response: {response_text}")
+
+        if status >= 500:
+            # Give the server a moment to recover before backoff retries the call.
+            await asyncio.sleep(15)
+            raise aiohttp.ServerConnectionError(response_text)
+
+    async def __aenter__(self):
+        return self
+
+    async def __aexit__(self, *args, **kwargs):
+        await self.close()
+
+    async def close(self):
+        if not self.session.closed:
+            await self.session.close()
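A minimal usage sketch for `CrawlerAPI`, assuming a `config.ini` with the `[crawler]` keys that `__init__` above reads; the `main` coroutine here is a hypothetical caller, not part of the patch:

```python
import asyncio

from crawler.api import CrawlerAPI


async def main():
    # __aexit__ closes the underlying aiohttp session on the way out.
    async with CrawlerAPI() as api:
        cities = await api.get_cities()
        proxy = await api.get_random_proxy()
        print(f"{len(cities)} cities, trying proxy {proxy.ip}:{proxy.port}")


asyncio.run(main())
```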
Response: {response_text}") + + if status >= 500: + await asyncio.sleep(15) + raise aiohttp.ServerConnectionError(response_text) + + async def __aenter__(self): + return self + + async def __aexit__(self, *args, **kwargs): + await self.close() + + async def close(self): + if not self.session.closed: + await self.session.close() diff --git a/crawler/types.py b/crawler/types.py new file mode 100644 index 0000000..1622bd0 --- /dev/null +++ b/crawler/types.py @@ -0,0 +1,15 @@ +from dataclasses import dataclass +from utils.classes import DataclassBase + +@dataclass(init=False) +class City(DataclassBase): + city: str + region_id: int | None + region_name: str | None + +@dataclass(init=False) +class Proxy(DataclassBase): + ip: str + port: str + login: str + password: str \ No newline at end of file diff --git a/crawler/utils/__init__.py b/crawler/utils/__init__.py new file mode 100644 index 0000000..158271e --- /dev/null +++ b/crawler/utils/__init__.py @@ -0,0 +1,4 @@ +def chunks(lst: list, n: int): + """Yield successive n-sized chunks from lst.""" + for i in range(0, len(lst), n): + yield lst[i:i + n] \ No newline at end of file diff --git a/crawler/utils/asyncio.py b/crawler/utils/asyncio.py new file mode 100644 index 0000000..01c6d03 --- /dev/null +++ b/crawler/utils/asyncio.py @@ -0,0 +1,11 @@ +import asyncio + + +async def gather_with_concurrency(n, *coros): + semaphore = asyncio.Semaphore(int(n)) + + async def sem_coro(coro): + async with semaphore: + return await coro + + return await asyncio.gather(*(sem_coro(c) for c in coros)) \ No newline at end of file diff --git a/crawler/utils/classes.py b/crawler/utils/classes.py new file mode 100644 index 0000000..51b0029 --- /dev/null +++ b/crawler/utils/classes.py @@ -0,0 +1,28 @@ +import dataclasses +import json +from dataclasses import dataclass + + +@dataclass(init=False) +class DataclassBase: + def __init__(self, **kwargs): + names = set([f.name for f in dataclasses.fields(self)]) + for k, v in kwargs.items(): + if k in names: + setattr(self, k, v) + + +class Singleton(type): + _instances = {} + + def __call__(cls, *args, **kwargs): + if cls not in cls._instances: + cls._instances[cls] = super(Singleton, cls).__call__(*args, **kwargs) + return cls._instances[cls] + + +class DataclassJSONEncoder(json.JSONEncoder): + def default(self, o): + if dataclasses.is_dataclass(o): + return dataclasses.asdict(o) + return super().default(o) \ No newline at end of file diff --git a/crawler/utils/exceptions.py b/crawler/utils/exceptions.py new file mode 100644 index 0000000..6b270de --- /dev/null +++ b/crawler/utils/exceptions.py @@ -0,0 +1,3 @@ +class ConfigError(Exception): + def __init__(self, *args: object) -> None: + super().__init__(*args) \ No newline at end of file diff --git a/crawler/utils/formater b/crawler/utils/formater new file mode 100755 index 0000000..9f02779 --- /dev/null +++ b/crawler/utils/formater @@ -0,0 +1,19 @@ +#!/bin/python3 + +import sys + +# Мне надоело вручную форматить json ответы, поэтому написал +# Эту утилиту. Берёт всего один аргумент: путь до json файла + +file_path = sys.argv[1:][0] + +if file_path: + import json + + f = open(file_path) + data = json.load(f) + + f = open(file_path, "w") + json.dump(data, f, indent=2, ensure_ascii=False) + + f.close() \ No newline at end of file