stolichki/main.py

import asyncio
import configparser
import json
import multiprocessing
import sys

from loguru import logger

from crawler.api import CrawlerAPI
from crawler.types import City, Proxy
from crawler.utils.classes import DataclassJSONEncoder
from stolichki.parsers.city import CityParser


async def get_crawler_data() -> tuple[list[City], list[Proxy]]:
    """Fetch the city and proxy lists from the crawler API, then close the client."""
    crawler = CrawlerAPI()
    cities = await crawler.get_cities()
    proxies = await crawler.get_proxies()
    await crawler.close()
    return cities, proxies
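

# filter_cities reads the file at [parser] cities_path. That file is not part
# of this snippet; judging by the cities_stolichki[city.city] lookup, it is
# assumed to be a flat JSON object mapping city names to stolichki ids, e.g.
# (hypothetical values):
#
#   {"Москва": 1, "Санкт-Петербург": 2}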
def filter_cities(config, cities: list[City]) -> list[City]:
    """Keep only the cities known to stolichki and attach their site-specific ids."""
    with open(config["parser"]["cities_path"], encoding="utf-8") as f:
        cities_stolichki = json.load(f)

    # Skip cities missing from the mapping instead of raising KeyError.
    filtered = [city for city in cities if city.city in cities_stolichki]
    for city in filtered:
        city.id = cities_stolichki[city.city]
    return filtered


def parse_city(city: City, proxies: list[Proxy]):
    """Worker entry point: parse a single city with the given proxies."""
    return CityParser(city, proxies).parse()
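

# main() reads config.ini from the working directory. The file itself is not
# shown here; a minimal sketch, assuming only the two keys used below (values
# are placeholders):
#
#   [parser]
#   cities_path = cities.json
#   cities_quantity = 4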
@logger.catch
def main(cities: list[City], proxies: list[Proxy]):
    """Parse every configured city in parallel and dump the results to results.json."""
    config = configparser.ConfigParser()
    config.read("config.ini")

    cities = filter_cities(config, cities)
    # cities_quantity caps the worker-pool size; default to one process per city.
    processes = int(config["parser"].get("cities_quantity") or len(cities))

    with multiprocessing.Pool(processes=processes) as pool:
        results = pool.starmap(parse_city, [(city, proxies) for city in cities])

    with open("results.json", "w", encoding="utf-8") as f:
        json.dump(results, f, cls=DataclassJSONEncoder, ensure_ascii=False, indent=4)


if __name__ == "__main__":
    # Replace loguru's default sink so DEBUG records are not logged twice.
    logger.remove()
    logger.add(sys.stderr, level="DEBUG", backtrace=True, enqueue=True)

    cities, proxies = asyncio.run(get_crawler_data())
    main(cities, proxies)
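
# Assuming the package layout implied by the imports, run from the repository
# root (so config.ini resolves) with: python -m stolichki.main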