Анатолий Богомолов 2024-01-05 15:52:44 +10:00
parent b97e8fe739
commit fe16d4eee9
6 changed files with 52 additions and 18 deletions

crawler/api.py

@@ -1,16 +1,16 @@
 import asyncio
 import configparser
 import random
-import logging
+from loguru import logger
 import aiohttp
 import backoff
-from utils.exceptions import ConfigError
+from crawler.utils.exceptions import ConfigError
 from crawler.types import City, Proxy
-from utils.classes import Singleton
+from crawler.utils.classes import Singleton
-log = logging.getLogger(__name__)
+log = logger
 class CrawlerAPI(metaclass=Singleton):
     api_baseurl = "https://q.asburo.ru/ch/"
@@ -94,7 +94,7 @@ class CrawlerAPI(metaclass=Singleton):
     @backoff.on_exception(backoff.expo, (aiohttp.ClientError, aiohttp.ServerConnectionError), max_tries=15, logger=log)
     async def send_products(self, results: list):
-        log.info("Sending data")
+        log.info("Sending data...")
         url = f"{self.api_url}/prices/{self.rival_tag}"
@@ -105,7 +105,7 @@ class CrawlerAPI(metaclass=Singleton):
         response = await self.session.post(url, json=data, auth=self.auth)
         status, response_text = response.status, await response.text()
-        log.info(f"{data} was sended. Status: {status}. Response: {response_text}")
+        log.debug(f"{data} was sended. Status: {status}. Response: {response_text}")
         if status >= 500:
             await asyncio.sleep(15)
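Note on the retry decorator kept as context above: backoff.on_exception(backoff.expo, ...) re-invokes the coroutine with exponentially growing delays whenever one of the listed aiohttp exceptions escapes, giving up after max_tries. A minimal, self-contained sketch of the same pattern (the endpoint and payload are invented for illustration and are not part of this repository):

import asyncio

import aiohttp
import backoff


# Retry on transient network errors with exponential backoff: delays of roughly
# 1s, 2s, 4s, ... between attempts, giving up after max_tries attempts.
@backoff.on_exception(
    backoff.expo,
    (aiohttp.ClientError, aiohttp.ServerConnectionError),
    max_tries=5,
)
async def post_json(session: aiohttp.ClientSession, url: str, payload: dict) -> str:
    async with session.post(url, json=payload) as response:
        # 4xx/5xx become aiohttp.ClientResponseError (a ClientError), so they are retried too.
        response.raise_for_status()
        return await response.text()


async def demo() -> None:
    async with aiohttp.ClientSession() as session:
        print(await post_json(session, "https://httpbin.org/post", {"ping": 1}))


if __name__ == "__main__":
    asyncio.run(demo())

The sketch raises on bad statuses so the decorator retries them; the committed method instead inspects response.status itself and sleeps when it sees a 5xx.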

crawler/types.py

@@ -1,11 +1,13 @@
 from dataclasses import dataclass
-from utils.classes import DataclassBase
+from crawler.utils.classes import DataclassBase
 @dataclass(init=False)
 class City(DataclassBase):
+    id: int = 0
     city: str
     region_id: int | None
     region_name: str | None
+    is_byapt: 1 | 0
 @dataclass(init=False)
 class Proxy(DataclassBase):
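The DataclassBase these types inherit from is not part of this diff, so the sketch below uses a hypothetical stand-in that fills declared fields from a dict, just to show how @dataclass(init=False) types like City are typically hydrated from API payloads. Note also that the added annotation is_byapt: 1 | 0 is not a valid type (it is just the integer expression 1 | 0); typing.Literal[0, 1] or bool is the conventional spelling for such a flag, as used here. All names below are assumptions, not the repository's code:

from dataclasses import dataclass, fields
from typing import Any, Literal


@dataclass(init=False)
class DataclassBaseSketch:
    # Hypothetical stand-in for crawler.utils.classes.DataclassBase:
    # copy matching keys from a payload dict into the declared fields.
    def __init__(self, **payload: Any) -> None:
        for f in fields(self):
            if f.name in payload:
                setattr(self, f.name, payload[f.name])


@dataclass(init=False)
class CitySketch(DataclassBaseSketch):
    id: int = 0
    city: str = ""
    region_id: int | None = None
    region_name: str | None = None
    is_byapt: Literal[0, 1] = 0  # instead of "1 | 0", which is not a type


city = CitySketch(city="Бутово", region_id=50, region_name="Московская область", is_byapt=1)
print(city)  # CitySketch(id=0, city='Бутово', region_id=50, region_name='Московская область', is_byapt=1)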

main.py

@@ -1,17 +1,50 @@
 import sys
+import asyncio
+import configparser
+import json
+import multiprocessing
 from loguru import logger
 from stolichki.parsers.city import CityParser
 from stolichki.types.city import City
+from crawler.api import CrawlerAPI
+from crawler.utils.classes import DataclassJSONEncoder
+from crawler.types import City, Proxy
+async def get_crawler_data():
+    crawler = CrawlerAPI()
+    cities = await crawler.get_cities()
+    proxies = await crawler.get_proxies()
+    return cities, proxies
+def filter_cities(config, cities: list[City]) -> list[City]:
+    with open(config["parser"]["cities_path"]) as f:
+        cities_stolichki = json.load(f)
+    for city in cities:
+        city.id = cities_stolichki[city.city]
+    return cities
 @logger.catch
-def main():
-    city = City(111, "Бутово", 1, [])
-    result = CityParser(city).parse()
-    print(result)
+def main(cities: list[City], proxies: list[Proxy]):
+    config = configparser.ConfigParser()
+    config.read("config.ini")
+    cities = filter_cities(config, cities)
+    quantity = config['parser']['cities_quantity'] or len(cities)
+    with multiprocessing.Pool(processes=quantity) as pool: #type: ignore
+        results = pool.map(lambda city: CityParser(city, proxies).parse(), cities)
+    with open("results.json", "w") as f:
+        json.dump(results, f, cls=DataclassJSONEncoder, ensure_ascii=False, indent=4)
 if __name__ == "__main__":
     logger.add(sys.stderr, level="DEBUG", backtrace=True, enqueue=True) #type: ignore
-    main()
+    cities, proxies = asyncio.run(get_crawler_data())
+    main(cities, proxies)
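Two practical caveats about the new main(), illustrated below as a hedged sketch rather than a claim about how this repository behaves: multiprocessing.Pool.map pickles the callable in order to ship it to worker processes, so a lambda that closes over proxies will normally fail to pickle, and configparser values are strings, so processes= wants an explicit int. A module-level worker plus functools.partial sidesteps both; parse_city and the data below are invented stand-ins for CityParser:

import json
import multiprocessing
from functools import partial


def parse_city(city: dict, proxies: list[str]) -> dict:
    # Invented stand-in for CityParser(city, proxies).parse().
    return {"city": city["city"], "proxies_available": len(proxies)}


def run_pool(cities: list[dict], proxies: list[str], quantity: str | int) -> list[dict]:
    worker = partial(parse_city, proxies=proxies)  # picklable, unlike a lambda
    with multiprocessing.Pool(processes=int(quantity)) as pool:
        return pool.map(worker, cities)


if __name__ == "__main__":
    cities = [{"city": "Москва"}, {"city": "Бутово"}]
    results = run_pool(cities, proxies=["127.0.0.1:8080"], quantity="2")
    with open("results.json", "w", encoding="utf-8") as f:
        json.dump(results, f, ensure_ascii=False, indent=4)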

stolichki/driver.py

@@ -19,7 +19,8 @@ from webdriver_manager.chrome import ChromeDriverManager
 from twocaptcha import TwoCaptcha
-from stolichki.errors import CaptchaError, ConfigError, LoadingError
+from stolichki.errors import CaptchaError, LoadingError
+from crawler.utils.exceptions import ConfigError
 class StolichkiDriver(uc.Chrome):
     def __init__(self, **kwargs):

stolichki/errors.py

@@ -3,9 +3,5 @@ class CaptchaError(Exception):
         super().__init__(*args)
 class LoadingError(Exception):
-    def __init__(self, *args: object) -> None:
-        super().__init__(*args)
-class ConfigError(Exception):
     def __init__(self, *args: object) -> None:
         super().__init__(*args)
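ConfigError disappears from this module because the commit now imports it from crawler.utils.exceptions (see the stolichki/driver.py hunk above). That module is not included in the diff; a hypothetical minimal shape, noting that an __init__ which only forwards *args to super().__init__ adds nothing over a bare subclass:

# crawler/utils/exceptions.py -- assumed content, only the import path is visible in this commit
class ConfigError(Exception):
    """Raised when required configuration is missing or malformed."""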

stolichki/parsers/city.py

@@ -1,12 +1,14 @@
 from selenium.webdriver.common.by import By
+from crawler.types import Proxy
 from stolichki.driver import StolichkiDriver
 from stolichki.parsers.category import get_category_parser
 from stolichki.types.city import City
 class CityParser:
-    def __init__(self, city: City) -> None:
+    def __init__(self, city, proxies: list[Proxy]) -> None:
         self.driver = StolichkiDriver()
         self.city = city
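This hunk only shows the new constructor signature, so the following is an assumption about how a parser typically consumes an injected proxy list (one randomly chosen proxy per run, with the city annotation kept for type checking); ProxySketch and CityParserSketch are invented names, not the repository's classes:

import random
from dataclasses import dataclass


@dataclass
class ProxySketch:
    # Stand-in for crawler.types.Proxy; its real fields are not visible in this diff.
    host: str
    port: int


class CityParserSketch:
    def __init__(self, city: str, proxies: list[ProxySketch]) -> None:
        self.city = city
        self.proxies = proxies
        # Assumption: pick one proxy per parser run.
        self.proxy = random.choice(proxies) if proxies else None

    def parse(self) -> dict:
        endpoint = f"{self.proxy.host}:{self.proxy.port}" if self.proxy else None
        return {"city": self.city, "proxy": endpoint}


print(CityParserSketch("Бутово", [ProxySketch("127.0.0.1", 8080)]).parse())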