# stolichki/main.py
import sys
import asyncio
import configparser
import json
import multiprocessing
from loguru import logger
from stolichki.parsers.city import CityParser
from stolichki.types.city import City
from crawler.api import CrawlerAPI
from crawler.utils.classes import DataclassJSONEncoder
from crawler.types import City, Proxy
async def get_crawler_data():
    """Fetch the city list and proxy pool from the crawler service.

    Returns:
        tuple: ``(cities, proxies)`` exactly as produced by the crawler API.
    """
    crawler = CrawlerAPI()
    try:
        cities = await crawler.get_cities()
        proxies = await crawler.get_proxies()
    finally:
        # Close the client even when a fetch raises — the original leaked
        # the connection on any error in get_cities()/get_proxies().
        await crawler.close()
    return cities, proxies
def filter_cities(config, cities: list[City]) -> list[City]:
    """Assign stolichki ids to cities and drop cities without a mapping.

    Loads a JSON object of ``{city name: stolichki id}`` from the path in
    ``config["parser"]["cities_path"]`` and sets each city's ``id`` in place.

    Args:
        config: parsed config with a ``parser.cities_path`` entry.
        cities: cities fetched from the crawler API.

    Returns:
        The cities that have an id in the mapping. The original raised
        KeyError on the first unknown city despite being named "filter";
        unknown cities are now skipped instead.
    """
    with open(config["parser"]["cities_path"], encoding="utf-8") as f:
        id_by_name = json.load(f)

    known = []
    for city in cities:
        # Only keep cities the stolichki site actually knows about.
        if city.city in id_by_name:
            city.id = id_by_name[city.city]
            known.append(city)
    return known
def parse_city(city: City):
    """Parse one city inside a pool worker and return its scraped data.

    NOTE(review): depends on the module-global ``proxies`` assigned in the
    ``__main__`` block. Under the "spawn" start method (default on
    Windows/macOS) workers re-import this module and ``proxies`` is never
    defined — confirm the pool forks, or pass proxies explicitly.
    """
    # Return the parser's result: the original discarded it, so pool.map
    # collected only None values and results.json was full of nulls.
    return CityParser(city, proxies).parse()
@logger.catch
def main(cities: list[City]):
    """Parse all configured cities in parallel and write results.json.

    Args:
        cities: cities from the crawler API; mapped to stolichki ids via
            filter_cities() before parsing.
    """
    config = configparser.ConfigParser()
    config.read("config.ini")
    cities = filter_cities(config, cities)

    # Worker count: explicit config value, or one process per city.
    # .get() avoids the KeyError the original raised when the
    # cities_quantity option was missing from config.ini.
    quantity = config["parser"].get("cities_quantity") or len(cities)

    with multiprocessing.Pool(processes=int(quantity)) as pool:  # type: ignore
        results = pool.map(parse_city, cities)

    # Explicit UTF-8: ensure_ascii=False emits raw non-ASCII text, which
    # fails or garbles on platforms whose default encoding is not UTF-8.
    with open("results.json", "w", encoding="utf-8") as f:
        json.dump(results, f, cls=DataclassJSONEncoder, ensure_ascii=False, indent=4)
if __name__ == "__main__":
    # Mirror log records to stderr with backtraces; enqueue=True routes
    # records through a queue, keeping logging usable from pool workers.
    logger.add(sys.stderr, level="DEBUG", backtrace=True, enqueue=True)  # type: ignore

    # Fetch cities and proxies up front; `proxies` is left module-global
    # because parse_city() reads it from the worker processes.
    cities, proxies = asyncio.run(get_crawler_data())
    main(cities)