
Python crawler: downloading AMap regions (provinces, cities, districts)

A Python crawler for downloading the China map and its province/city/district drill-down JSON files from https://datav.aliyun.com/portal/school/atlas/area_selector, for display in ECharts or Leaflet.
A few cities' full.json files may be missing because the API request for them does not go through; those can be downloaded manually from https://datav.aliyun.com/portal/school/atlas/area_selector.

  1. Download and parse the all.json file.

  2. Iterate over the JSON array in all.json and read each adcode.

  3. Check whether adcode.json already exists in the dist directory; if not, download and save it.

  4. If the adcode ends in 00, check whether adcode_full.json already exists in the dist directory; if not, download and save it. (A few cities, such as Dongguan, have no subordinate districts; if their _full.json is unavailable, the failure can simply be ignored.) The adcode-to-URL rule is sketched right after this list.

  5. If any downloads failed, retry the failed files.

  6. Print the download results and the list of skipped files.

  7. Compress the files by stripping redundant whitespace.
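To make the rule in steps 3 and 4 concrete, here is a minimal sketch of how one adcode maps to the URL(s) the scripts below request. The bound/{adcode}.json and bound/{adcode}_full.json patterns come straight from the code further down; the helper name urls_for is just for illustration.

BASE = "https://geo.datav.aliyun.com/areas_v3/bound"

# Illustrative helper (not part of the scripts below): map one adcode to the URL(s) to fetch.
def urls_for(adcode):
    urls = [f"{BASE}/{adcode}.json"]               # every area has its own outline file
    if adcode.endswith("00"):                      # provinces and most cities also have a _full file
        urls.append(f"{BASE}/{adcode}_full.json")  # boundaries of the child areas
    return urls

print(urls_for("110000"))  # Beijing: outline plus children
print(urls_for("110101"))  # Dongcheng District: outline only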

Python code for this article

Original project code on GitHub (Node version)


The script below works as-is:

import os
import json
import requests
from tqdm import tqdm

# Download a URL and save the response as formatted JSON
def download_file(url, filepath):
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        data = response.json()  # parse before opening the file so a bad response never leaves a partial file behind
        with open(filepath, 'w', encoding='utf-8') as file:
            json.dump(data, file, ensure_ascii=False, indent=4)
        return True
    except Exception as e:
        print(f"Failed to download {os.path.basename(filepath)}: {e}")
        return False

# Step 1: download and parse all.json
def first_step():
    url = 'https://geo.datav.aliyun.com/areas_v3/bound/all.json'
    print('Downloading all.json...')
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        print('Downloaded all.json successfully')
        return response.json()
    except Exception as e:
        print(f"Failed to download all.json: {e}")
        exit(1)

# Step 2: iterate over the JSON array and download the JSON for each adcode
def second_step(json_array):
    failed_downloads = []
    skipped_downloads = []

    # Make sure the dist directory exists
    dist_path = os.path.join(os.getcwd(), 'dist')
    os.makedirs(dist_path, exist_ok=True)

    # Download the files one by one
    for item in tqdm(json_array, desc="Downloading files"):
        adcode = str(item['adcode'])
        is_special_code = adcode.endswith('00')

        # Build the download URLs and file names
        normal_file_name = f"{adcode}.json"
        full_file_name = f"{adcode}_full.json"
        normal_file_path = os.path.join(dist_path, normal_file_name)
        full_file_path = os.path.join(dist_path, full_file_name)

        # Check for and download the plain adcode.json file
        if not os.path.exists(normal_file_path):
            url = f"https://geo.datav.aliyun.com/areas_v3/bound/{adcode}.json"
            success = download_file(url, normal_file_path)
            if not success:
                failed_downloads.append(adcode)
        else:
            skipped_downloads.append(normal_file_name)

        # For codes ending in 00, check for and download the _full file
        if is_special_code:
            if not os.path.exists(full_file_path):
                url = f"https://geo.datav.aliyun.com/areas_v3/bound/{adcode}_full.json"
                success = download_file(url, full_file_path)
                if not success:
                    failed_downloads.append(adcode)
            else:
                skipped_downloads.append(full_file_name)

    # Return the lists of failed and skipped downloads
    return failed_downloads, skipped_downloads

# Main entry point
def main():
    json_array = first_step()
    failed_downloads, skipped_downloads = second_step(json_array)

    # Report skipped downloads
    if skipped_downloads:
        print(f"Skipped downloads: {', '.join(skipped_downloads)}")

    # Retry any failed downloads once
    if failed_downloads:
        print(f"Retrying failed downloads: {', '.join(failed_downloads)}")
        retry_result, _ = second_step([{'adcode': adcode} for adcode in failed_downloads])
        if retry_result:
            print(f"Failed downloads after retry: {', '.join(retry_result)}")
        else:
            print("All failed downloads were successful on retry")
    else:
        print("All downloads completed successfully")

if __name__ == "__main__":
    main()
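As noted at the top, a few _full.json files cannot be fetched through the API and have to be downloaded by hand from https://datav.aliyun.com/portal/school/atlas/area_selector. A minimal sketch that lists which adcodes still lack a _full.json in dist/ (assuming the dist/ layout produced by the script above; the helper name is illustrative). Cities without subordinate districts, such as Dongguan, may legitimately show up here:

import os
import requests

# Illustrative helper: list adcodes ending in 00 whose _full.json is still missing from dist/,
# so they can be fetched manually from the DataV area selector page.
def report_missing_full(dist_dir='./dist'):
    items = requests.get('https://geo.datav.aliyun.com/areas_v3/bound/all.json', timeout=10).json()
    missing = []
    for item in items:
        adcode = str(item['adcode'])
        if adcode.endswith('00') and not os.path.exists(os.path.join(dist_dir, f"{adcode}_full.json")):
            missing.append(adcode)
    print("Missing _full.json for:", ', '.join(missing) or 'none')

if __name__ == "__main__":
    report_missing_full()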

Download by reading a local all.json file (this file is a copy of https://geo.datav.aliyun.com/areas_v3/bound/all.json).
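If you do not yet have that local copy, a minimal one-off sketch to save it (same URL as above, written to ./all.json, which is the path the script below expects):

import requests

# One-off: fetch all.json from DataV and save it next to the script as ./all.json
response = requests.get('https://geo.datav.aliyun.com/areas_v3/bound/all.json', timeout=10)
response.raise_for_status()
with open('./all.json', 'w', encoding='utf-8') as f:
    f.write(response.text)
print('Saved ./all.json')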

import os
import json
import requests
import time
from tqdm import tqdm


# Download a file with a per-file progress bar
def download_file_with_progress(url, filepath, delay=1):
    try:
        # Stream the response
        response = requests.get(url, stream=True, timeout=10)
        response.raise_for_status()

        # Get the total file size from the headers (0 if not reported)
        total_size = int(response.headers.get('content-length', 0))
        chunk_size = 1024  # read 1 KB per chunk

        # Open the target file for writing
        with open(filepath, 'wb') as file:
            with tqdm(total=total_size, unit='B', unit_scale=True, desc=os.path.basename(filepath)) as progress_bar:
                for chunk in response.iter_content(chunk_size=chunk_size):
                    file.write(chunk)
                    progress_bar.update(len(chunk))

        time.sleep(delay)  # pause between downloads to be gentle on the server
        return True
    except Exception as e:
        print(f"Failed to download {os.path.basename(filepath)}: {e}")
        return False


# Step 1: read the local all.json file
def first_step():
    local_path = './all.json'
    print('Reading all.json...')
    try:
        with open(local_path, 'r', encoding='utf-8') as file:
            data = json.load(file)
        print('Read all.json successfully')
        return data
    except Exception as e:
        print(f"Failed to read all.json: {e}")
        exit(1)


# Step 2: iterate over the JSON array and download the JSON for each adcode
def second_step(json_array):
    failed_downloads = []
    skipped_downloads = []

    # Make sure the dist directory exists
    dist_path = os.path.join(os.getcwd(), 'dist')
    os.makedirs(dist_path, exist_ok=True)

    # Download the files one by one
    for item in json_array:
        adcode = str(item['adcode'])
        is_special_code = adcode.endswith('00')

        # Build the download URLs and file names
        normal_file_name = f"{adcode}.json"
        full_file_name = f"{adcode}_full.json"
        normal_file_path = os.path.join(dist_path, normal_file_name)
        full_file_path = os.path.join(dist_path, full_file_name)

        # Check for and download the plain adcode.json file
        if not os.path.exists(normal_file_path):
            url = f"https://geo.datav.aliyun.com/areas_v3/bound/{adcode}.json"
            success = download_file_with_progress(url, normal_file_path, delay=1)
            if not success:
                failed_downloads.append(adcode)
        else:
            print(f"Skipped: {normal_file_name}")
            skipped_downloads.append(normal_file_name)

        # For codes ending in 00, check for and download the _full file
        if is_special_code:
            if not os.path.exists(full_file_path):
                url = f"https://geo.datav.aliyun.com/areas_v3/bound/{adcode}_full.json"
                success = download_file_with_progress(url, full_file_path, delay=1)
                if not success:
                    failed_downloads.append(adcode)
            else:
                print(f"Skipped: {full_file_name}")
                skipped_downloads.append(full_file_name)

    # Return the lists of failed and skipped downloads
    return failed_downloads, skipped_downloads


# Main entry point
def main():
    json_array = first_step()
    failed_downloads, skipped_downloads = second_step(json_array)

    # Report skipped downloads
    if skipped_downloads:
        print(f"Skipped downloads: {', '.join(skipped_downloads)}")

    # Retry any failed downloads once
    if failed_downloads:
        print(f"Retrying failed downloads: {', '.join(failed_downloads)}")
        retry_result, _ = second_step([{'adcode': adcode} for adcode in failed_downloads])
        if retry_result:
            print(f"Failed downloads after retry: {', '.join(retry_result)}")
        else:
            print("All failed downloads were successful on retry")
    else:
        print("All downloads completed successfully")


if __name__ == "__main__":
    main()

Compress the files, otherwise they take up too much space:

import os
import json

# Compress a single JSON file by stripping whitespace
def compress_json_file(input_filepath, output_dir):
    try:
        # Read the JSON data
        with open(input_filepath, 'r', encoding='utf-8') as file:
            data = json.load(file)

        # Make sure the output directory exists
        os.makedirs(output_dir, exist_ok=True)

        # Build the output file path
        filename = os.path.basename(input_filepath)
        output_filepath = os.path.join(output_dir, filename)

        # Write the JSON back without indentation or extra spaces
        with open(output_filepath, 'w', encoding='utf-8') as file:
            json.dump(data, file, ensure_ascii=False, separators=(',', ':'))
        print(f"Compressed: {filename} -> {output_filepath}")
    except Exception as e:
        print(f"Failed to compress {input_filepath}: {e}")

# Walk the dist folder and compress every JSON file
def compress_all_json_files(input_dir, output_dir):
    if not os.path.exists(input_dir):
        print(f"Input directory {input_dir} does not exist.")
        return

    # Walk all JSON files under the input directory
    for root, _, files in os.walk(input_dir):
        for file in files:
            if file.endswith('.json'):
                input_filepath = os.path.join(root, file)
                compress_json_file(input_filepath, output_dir)

# Main entry point
def main():
    input_dir = './dist'  # directory holding the original JSON files
    output_dir = './compressed'  # directory for the compressed output
    compress_all_json_files(input_dir, output_dir)
    print(f"All JSON files from {input_dir} have been compressed into {output_dir}")

if __name__ == "__main__":
    main()
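As a sanity check, a small hedged sketch to confirm that compression only stripped whitespace and did not change any data, using 100000_full.json (the national map with province boundaries) as an example file name, assuming it was downloaded earlier:

import json

# Compare one file in ./dist with its counterpart in ./compressed; json.load ignores
# whitespace, so equal objects mean compression did not change the data.
def verify(name):
    with open(f'./dist/{name}', encoding='utf-8') as a, open(f'./compressed/{name}', encoding='utf-8') as b:
        assert json.load(a) == json.load(b), f"{name}: data changed during compression"
    print(f"{name}: OK")

verify('100000_full.json')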


Original article: https://blog.csdn.net/Shi_haoliu/article/details/143889595
