Added crawler api library

Анатолий Богомолов 2024-01-03 14:34:15 +10:00
parent 71825351c8
commit 049e436d37
8 changed files with 202 additions and 0 deletions

0 crawler/__init__.py Normal file

122 crawler/api.py Normal file

@@ -0,0 +1,122 @@
import asyncio
import configparser
import random
import logging

import aiohttp
import backoff

from utils.exceptions import ConfigError
from crawler.types import City, Proxy
from utils.classes import Singleton

log = logging.getLogger(__name__)


class CrawlerAPI(metaclass=Singleton):
    api_baseurl = "https://q.asburo.ru/ch/"
    api_version = "v1"

    cities = []
    proxies = []

    def __init__(self) -> None:
        log.info("Initializing crawler API class")
        self.api_url = self.api_baseurl + self.api_version

        config = configparser.ConfigParser()
        try:
            config.read("config.ini")
            log.debug("Reading config file")
            self.config = config["crawler"]
            self.rival_tag = self.config["tag"]
            self.auth = aiohttp.BasicAuth(
                self.config["username"], self.config["password"]
            )
            log.info("Successfully parsed config file.")
            log.debug(f"Your auth: {self.auth}")
        except (configparser.Error, KeyError) as error:
            raise ConfigError(
                "Can't read settings for crawler api. Check your config.ini."
            ) from error

        timeout = aiohttp.ClientTimeout(total=12000)
        self.session = aiohttp.ClientSession(timeout=timeout)
        log.info("Crawler API initialized")

    @backoff.on_exception(backoff.expo, (aiohttp.ClientError, aiohttp.ServerConnectionError), max_time=60)
    async def get_cities(self) -> list[City]:
        # Cities are fetched once and cached for the lifetime of the singleton.
        if len(self.cities) <= 0:
            url = f"{self.api_url}/cities/{self.rival_tag}"
            response = await self.session.get(url, auth=self.auth)
            if response.status >= 500:
                raise aiohttp.ServerConnectionError()
            log.debug(f"Response status: {response.status} for {url}")
            if response.status == 200:
                json_response = await response.json()
                self.cities = [City(**city) for city in json_response.get("city_list")]
        return self.cities

    @backoff.on_exception(backoff.expo, aiohttp.ClientError, max_time=60)
    async def get_proxies(self) -> list[Proxy]:
        if len(self.proxies) <= 0:
            url = f"{self.api_url}/proxies/"
            response = await self.session.get(url, auth=self.auth)
            log.debug(f"Response status: {response.status} for {url}")
            if response.status == 200:
                json_response = await response.json()
                self.proxies = [Proxy(**proxy) for proxy in json_response.get("proxy_list")]
        return self.proxies

    async def get_random_proxy(self) -> Proxy:
        proxies = await self.get_proxies()
        return random.choice(proxies)

    async def remove_proxy(self, proxy: Proxy) -> Proxy:
        # Drop a dead proxy from the cache and hand back a replacement.
        self.proxies.remove(proxy)
        return await self.get_random_proxy()

    @backoff.on_exception(backoff.expo, (aiohttp.ClientError, aiohttp.ServerConnectionError), max_tries=15, logger=log)
    async def send_products(self, results: list):
        log.info("Sending data")
        url = f"{self.api_url}/prices/{self.rival_tag}"
        data = {"rows": results}
        response = await self.session.post(url, json=data, auth=self.auth)
        status, response_text = response.status, await response.text()
        log.info(f"{data} was sent. Status: {status}. Response: {response_text}")
        if status >= 500:
            # Give the server a moment before backoff retries the request.
            await asyncio.sleep(15)
            raise aiohttp.ServerConnectionError(response_text)

    async def __aenter__(self):
        return self

    async def __aexit__(self, *args, **kwargs):
        await self.close()

    async def close(self):
        if not self.session.closed:
            await self.session.close()
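
A minimal usage sketch of the class above, assuming a config.ini with a [crawler] section (tag, username, password) and a reachable API:

import asyncio

from crawler.api import CrawlerAPI


async def main():
    # CrawlerAPI is a singleton and an async context manager,
    # so the underlying aiohttp session is closed on exit.
    async with CrawlerAPI() as api:
        cities = await api.get_cities()
        proxy = await api.get_random_proxy()
        print(len(cities), proxy)


asyncio.run(main())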

15 crawler/types.py Normal file

@@ -0,0 +1,15 @@
from dataclasses import dataclass

from utils.classes import DataclassBase


@dataclass(init=False)
class City(DataclassBase):
    city: str
    region_id: int | None
    region_name: str | None


@dataclass(init=False)
class Proxy(DataclassBase):
    ip: str
    port: str
    login: str
    password: str
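
A short illustration of how these dataclasses are fed from API payloads (values here are hypothetical); DataclassBase.__init__ simply drops keys that are not declared fields:

from crawler.types import City

payload = {"city": "Moscow", "region_id": 77, "region_name": "Moscow", "extra": 1}
city = City(**payload)  # the unknown "extra" key is ignored by DataclassBase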


@@ -0,0 +1,4 @@
def chunks(lst: list, n: int):
    """Yield successive n-sized chunks from lst."""
    for i in range(0, len(lst), n):
        yield lst[i:i + n]
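
For example, this helper can split result rows into batches before sending (the batch size below is illustrative):

rows = list(range(10))
for batch in chunks(rows, 4):  # chunks() as defined above
    print(batch)  # [0, 1, 2, 3], then [4, 5, 6, 7], then [8, 9]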

11 crawler/utils/asyncio.py Normal file

@@ -0,0 +1,11 @@
import asyncio


async def gather_with_concurrency(n, *coros):
    # Like asyncio.gather(), but at most n coroutines run at the same time.
    semaphore = asyncio.Semaphore(int(n))

    async def sem_coro(coro):
        async with semaphore:
            return await coro

    return await asyncio.gather(*(sem_coro(c) for c in coros))
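
A small usage sketch for the helper above; fetch() is a hypothetical stand-in for real crawler work:

import asyncio


async def fetch(i):
    await asyncio.sleep(0.1)  # simulate an HTTP request
    return i


async def main():
    # at most 5 of the 20 coroutines run concurrently
    results = await gather_with_concurrency(5, *(fetch(i) for i in range(20)))
    print(results)


asyncio.run(main())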

28 crawler/utils/classes.py Normal file

@@ -0,0 +1,28 @@
import dataclasses
import json
from dataclasses import dataclass


@dataclass(init=False)
class DataclassBase:
    """Base dataclass that only accepts keyword arguments matching declared fields."""

    def __init__(self, **kwargs):
        names = {f.name for f in dataclasses.fields(self)}
        for k, v in kwargs.items():
            if k in names:
                setattr(self, k, v)


class Singleton(type):
    _instances = {}

    def __call__(cls, *args, **kwargs):
        if cls not in cls._instances:
            cls._instances[cls] = super(Singleton, cls).__call__(*args, **kwargs)
        return cls._instances[cls]


class DataclassJSONEncoder(json.JSONEncoder):
    def default(self, o):
        if dataclasses.is_dataclass(o):
            return dataclasses.asdict(o)
        return super().default(o)
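
A quick sketch of serializing the crawler dataclasses with DataclassJSONEncoder (values are hypothetical):

import json

from crawler.types import Proxy

proxy = Proxy(ip="127.0.0.1", port="8080", login="user", password="secret")
print(json.dumps(proxy, cls=DataclassJSONEncoder))
# {"ip": "127.0.0.1", "port": "8080", "login": "user", "password": "secret"}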


@@ -0,0 +1,3 @@
class ConfigError(Exception):
    def __init__(self, *args: object) -> None:
        super().__init__(*args)

19 crawler/utils/formater Executable file

@@ -0,0 +1,19 @@
#!/bin/python3
import sys

# I got tired of formatting JSON responses by hand, so I wrote this
# utility. It takes a single argument: the path to a JSON file.
file_path = sys.argv[1] if len(sys.argv) > 1 else None

if file_path:
    import json

    with open(file_path) as f:
        data = json.load(f)

    with open(file_path, "w") as f:
        json.dump(data, f, indent=2, ensure_ascii=False)