stolichki/main.py

import sys
import asyncio
import configparser
import json
import multiprocessing
from functools import partial

from loguru import logger

from crawler.api import CrawlerAPI
from crawler.types import City, Proxy
from crawler.utils.classes import DataclassJSONEncoder
from stolichki.parsers.city import CityParser


async def get_crawler_data():
    """Fetch the city list and proxy pool from the crawler API."""
    crawler = CrawlerAPI()
    cities = await crawler.get_cities()
    proxies = await crawler.get_proxies()
    return cities, proxies


def filter_cities(config, cities: list[City]) -> list[City]:
    """Keep only cities known to Stolichki and attach their Stolichki IDs."""
    with open(config["parser"]["cities_path"]) as f:
        cities_stolichki = json.load(f)
    cities = [city for city in cities if city.city in cities_stolichki]
    for city in cities:
        city.id = cities_stolichki[city.city]
    return cities
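

# multiprocessing.Pool.map pickles the callable it is given, and lambdas are
# not picklable, so the per-city work is moved out of the original inline
# lambda into this module-level helper; proxies comes first so that
# functools.partial can bind it.
def parse_city(proxies: list[Proxy], city: City):
    return CityParser(city, proxies).parse()
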
@logger.catch
def main(cities: list[City], proxies: list[Proxy]):
    config = configparser.ConfigParser()
    config.read("config.ini")
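    # config.ini is assumed to provide, under [parser], the two keys read
    # below; the values shown are illustrative, not taken from the source:
    #   [parser]
    #   cities_path = cities.json
    #   cities_quantity = 4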
    cities = filter_cities(config, cities)
    # configparser values are strings and Pool needs an int; fall back to one
    # process per city when cities_quantity is missing or empty.
    quantity = int(config["parser"].get("cities_quantity") or len(cities))
    with multiprocessing.Pool(processes=quantity) as pool:
        results = pool.map(partial(parse_city, proxies), cities)

    with open("results.json", "w") as f:
        json.dump(results, f, cls=DataclassJSONEncoder, ensure_ascii=False, indent=4)

if __name__ == "__main__":
    logger.add(sys.stderr, level="DEBUG", backtrace=True, enqueue=True)
    cities, proxies = asyncio.run(get_crawler_data())
    main(cities, proxies)
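
# DataclassJSONEncoder is imported from crawler.utils.classes, which is not
# shown here. Assuming it follows the usual pattern for passing dataclasses
# through json.dump, a minimal sketch would be:
#
#   import dataclasses
#   import json
#
#   class DataclassJSONEncoder(json.JSONEncoder):
#       def default(self, o):
#           if dataclasses.is_dataclass(o):
#               return dataclasses.asdict(o)
#           return super().default(o)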