"""Entry point for the Stolichki parser: fetch cities and proxies from the
crawler API, parse every city in a pool of worker processes, and dump the
collected results to results.json."""

import asyncio
import configparser
import json
import multiprocessing
import sys
from functools import partial

from loguru import logger

from stolichki.parsers.city import CityParser

from crawler.api import CrawlerAPI
from crawler.types import City, Proxy
from crawler.utils.classes import DataclassJSONEncoder


async def get_crawler_data() -> tuple[list[City], list[Proxy]]:
    """Fetch the city and proxy lists from the crawler API, then close the client."""
    crawler = CrawlerAPI()
    cities = await crawler.get_cities()
    proxies = await crawler.get_proxies()
    await crawler.close()
    return cities, proxies


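# The file referenced by [parser] cities_path is assumed to be a flat JSON
# object mapping city name -> Stolichki city id, i.e. {"<city name>": <id>},
# since it is indexed by city.city below.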
def filter_cities(config, cities: list[City]) -> list[City]:
    """Keep the cities known to Stolichki and attach their Stolichki ids."""
    with open(config["parser"]["cities_path"]) as f:
        cities_stolichki = json.load(f)

    # Keep only cities present in the mapping so the id lookup cannot KeyError.
    cities = [city for city in cities if city.city in cities_stolichki]

    for city in cities:
        city.id = cities_stolichki[city.city]

    return cities


def parse_city(city: City, proxies: list[Proxy]):
    # Proxies are passed in explicitly (not read from a global) so workers also
    # run under the "spawn" start method, where parent globals are not inherited.
    # CityParser.parse() is assumed to return the data collected into results.json.
    return CityParser(city, proxies).parse()


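# Expected config.ini layout (illustrative values; only the keys read below are shown):
#
#   [parser]
#   # JSON file mapping city name -> Stolichki id
#   cities_path = cities.json
#   # number of worker processes; defaults to one per city when empty
#   cities_quantity = 4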
@logger.catch
def main(cities: list[City], proxies: list[Proxy]):
    config = configparser.ConfigParser()
    config.read("config.ini")

    cities = filter_cities(config, cities)

    # Fall back to one worker per city when cities_quantity is unset or empty.
    quantity = config["parser"].get("cities_quantity") or len(cities)

    with multiprocessing.Pool(processes=int(quantity)) as pool:
        # partial binds the shared proxy list so pool.map can pass each city alone.
        results = pool.map(partial(parse_city, proxies=proxies), cities)

    # DataclassJSONEncoder serializes the parsed dataclasses to plain JSON.
    with open("results.json", "w") as f:
        json.dump(results, f, cls=DataclassJSONEncoder, ensure_ascii=False, indent=4)


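# The __main__ guard matters with multiprocessing: under the "spawn" start
# method workers re-import this module, and the guard keeps them from
# re-running the crawler fetch below.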
if __name__ == "__main__":
    logger.add(sys.stderr, level="DEBUG", backtrace=True, enqueue=True)  # type: ignore

    cities, proxies = asyncio.run(get_crawler_data())
    main(cities, proxies)