Python scraper: downloading AMap (Gaode) region boundaries (province, city, district)
A Python scraper that downloads the China map, plus its drill-down province/city/district JSON files, from https://datav.aliyun.com/portal/school/atlas/area_selector, for display in ECharts or Leaflet.
A few cities' full.json files may be missing; when the API request for them fails, you can download those files manually from https://datav.aliyun.com/portal/school/atlas/area_selector.
- Download and parse all.json.
- Iterate over the JSON array in all.json and take each adcode.
- If dist/{adcode}.json does not exist yet, download and save it.
- If the adcode ends in 00 (a province or prefecture-level city), and dist/{adcode}_full.json does not exist yet, download and save it. Some cities whose adcode ends in 00 have no subordinate districts (e.g. Dongguan), so their _full.json does not exist and the failed download can be ignored; see the sketch after this list.
- If any downloads failed, retry the failed files.
- Print the download results and the list of skipped files.
- Compress the files by stripping redundant whitespace.
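The URL scheme the list above relies on: every region has a plain {adcode}.json boundary file, and regions whose adcode ends in 00 additionally have an {adcode}_full.json containing their child subdivisions. A minimal sketch of that rule (the helper urls_for is ours, for illustration only; it is not part of the scripts below):

# Sketch: which DataV GeoAtlas URLs should exist for a given adcode.
BASE = "https://geo.datav.aliyun.com/areas_v3/bound"

def urls_for(adcode: str) -> list:
    urls = [f"{BASE}/{adcode}.json"]  # the region's own boundary
    if adcode.endswith("00"):  # province or prefecture-level city
        urls.append(f"{BASE}/{adcode}_full.json")  # region plus its children
    return urls

print(urls_for("440100"))  # Guangzhou: boundary and _full
print(urls_for("440103"))  # Liwan District: boundary only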
This version works out of the box:
import os
import sys
import json
import requests
from tqdm import tqdm

# Download a single file and save it as pretty-printed JSON
def download_file(url, filepath):
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        with open(filepath, 'w', encoding='utf-8') as file:
            json.dump(response.json(), file, ensure_ascii=False, indent=4)
        return True
    except Exception as e:
        print(f"Failed to download {os.path.basename(filepath)}: {e}")
        return False

# Step 1: download and parse all.json
def first_step():
    url = 'https://geo.datav.aliyun.com/areas_v3/bound/all.json'
    print('Downloading all.json...')
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        print('Downloaded all.json successfully')
        return response.json()
    except Exception as e:
        print(f"Failed to download all.json: {e}")
        sys.exit(1)

# Step 2: walk the JSON array and download the files for each adcode
def second_step(json_array):
    failed_downloads = []
    skipped_downloads = []
    # Make sure the dist directory exists
    dist_path = os.path.join(os.getcwd(), 'dist')
    os.makedirs(dist_path, exist_ok=True)
    # Download one by one
    for item in tqdm(json_array, desc="Downloading files"):
        adcode = str(item['adcode'])
        is_special_code = adcode.endswith('00')  # provinces and cities also have a _full file
        # Build download URLs and file names
        normal_file_name = f"{adcode}.json"
        full_file_name = f"{adcode}_full.json"
        normal_file_path = os.path.join(dist_path, normal_file_name)
        full_file_path = os.path.join(dist_path, full_file_name)
        # Download the plain boundary file if it is not there yet
        if not os.path.exists(normal_file_path):
            url = f"https://geo.datav.aliyun.com/areas_v3/bound/{adcode}.json"
            success = download_file(url, normal_file_path)
            if not success:
                failed_downloads.append(adcode)
        else:
            skipped_downloads.append(normal_file_name)
        # For adcodes ending in 00, also download the _full file
        if is_special_code:
            if not os.path.exists(full_file_path):
                url = f"https://geo.datav.aliyun.com/areas_v3/bound/{adcode}_full.json"
                success = download_file(url, full_file_path)
                if not success:
                    failed_downloads.append(adcode)
            else:
                skipped_downloads.append(full_file_name)
    # Return the failed and skipped download lists
    return failed_downloads, skipped_downloads

# Main entry point
def main():
    json_array = first_step()
    failed_downloads, skipped_downloads = second_step(json_array)
    # Report skipped downloads
    if skipped_downloads:
        print(f"Skipped downloads: {', '.join(skipped_downloads)}")
    # Retry any failed downloads once
    if failed_downloads:
        print(f"Retrying failed downloads: {', '.join(failed_downloads)}")
        retry_result, _ = second_step([{'adcode': adcode} for adcode in failed_downloads])
        if retry_result:
            print(f"Failed downloads after retry: {', '.join(retry_result)}")
        else:
            print("All failed downloads were successful on retry")
    else:
        print("All downloads completed successfully")

if __name__ == "__main__":
    main()
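After a run, a quick sanity check of the output can be useful. A minimal sketch, assuming the downloaded files are GeoJSON FeatureCollections (the usual top-level type for these boundary files):

import os
import json

# Sketch: confirm every file in dist parses and looks like GeoJSON.
for name in sorted(os.listdir('dist')):
    if not name.endswith('.json'):
        continue
    with open(os.path.join('dist', name), encoding='utf-8') as f:
        data = json.load(f)
    if data.get('type') != 'FeatureCollection':
        print(f"Unexpected content in {name}")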
Download using a local all.json file (this file is a copy of https://geo.datav.aliyun.com/areas_v3/bound/all.json):
import os
import sys
import json
import requests
import time
from tqdm import tqdm

# Download a file with a per-file progress bar
def download_file_with_progress(url, filepath, delay=1):
    try:
        # Stream the response instead of loading it all at once
        response = requests.get(url, stream=True, timeout=10)
        response.raise_for_status()
        # Total size for the progress bar (0 if the header is missing)
        total_size = int(response.headers.get('content-length', 0))
        chunk_size = 1024  # read 1 KB at a time
        # Write the file chunk by chunk
        with open(filepath, 'wb') as file:
            with tqdm(total=total_size, unit='B', unit_scale=True, desc=os.path.basename(filepath)) as progress_bar:
                for chunk in response.iter_content(chunk_size=chunk_size):
                    file.write(chunk)
                    progress_bar.update(len(chunk))
        time.sleep(delay)  # pause between downloads to go easy on the server
        return True
    except Exception as e:
        print(f"Failed to download {os.path.basename(filepath)}: {e}")
        return False

# Step 1: read the local all.json file
def first_step():
    local_path = './all.json'
    print('Reading all.json...')
    try:
        with open(local_path, 'r', encoding='utf-8') as file:
            data = json.load(file)
        print('Read all.json successfully')
        return data
    except Exception as e:
        print(f"Failed to read all.json: {e}")
        sys.exit(1)

# Step 2: walk the JSON array and download the files for each adcode
def second_step(json_array):
    failed_downloads = []
    skipped_downloads = []
    # Make sure the dist directory exists
    dist_path = os.path.join(os.getcwd(), 'dist')
    os.makedirs(dist_path, exist_ok=True)
    # Download one by one
    for item in json_array:
        adcode = str(item['adcode'])
        is_special_code = adcode.endswith('00')  # provinces and cities also have a _full file
        # Build download URLs and file names
        normal_file_name = f"{adcode}.json"
        full_file_name = f"{adcode}_full.json"
        normal_file_path = os.path.join(dist_path, normal_file_name)
        full_file_path = os.path.join(dist_path, full_file_name)
        # Download the plain boundary file if it is not there yet
        if not os.path.exists(normal_file_path):
            url = f"https://geo.datav.aliyun.com/areas_v3/bound/{adcode}.json"
            success = download_file_with_progress(url, normal_file_path, delay=1)
            if not success:
                failed_downloads.append(adcode)
        else:
            print(f"Skipped: {normal_file_name}")
            skipped_downloads.append(normal_file_name)
        # For adcodes ending in 00, also download the _full file
        if is_special_code:
            if not os.path.exists(full_file_path):
                url = f"https://geo.datav.aliyun.com/areas_v3/bound/{adcode}_full.json"
                success = download_file_with_progress(url, full_file_path, delay=1)
                if not success:
                    failed_downloads.append(adcode)
            else:
                print(f"Skipped: {full_file_name}")
                skipped_downloads.append(full_file_name)
    # Return the failed and skipped download lists
    return failed_downloads, skipped_downloads

# Main entry point
def main():
    json_array = first_step()
    failed_downloads, skipped_downloads = second_step(json_array)
    # Report skipped downloads
    if skipped_downloads:
        print(f"Skipped downloads: {', '.join(skipped_downloads)}")
    # Retry any failed downloads once
    if failed_downloads:
        print(f"Retrying failed downloads: {', '.join(failed_downloads)}")
        retry_result, _ = second_step([{'adcode': adcode} for adcode in failed_downloads])
        if retry_result:
            print(f"Failed downloads after retry: {', '.join(retry_result)}")
        else:
            print("All failed downloads were successful on retry")
    else:
        print("All downloads completed successfully")

if __name__ == "__main__":
    main()
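As noted at the top, a few _full.json files cannot be fetched through the API and have to be downloaded by hand. To list what is still missing after a run, a small sketch (it reuses the local ./all.json that this script already reads):

import os
import json

# Sketch: list adcodes ending in 00 whose _full.json is missing from dist.
# Note: cities without districts (e.g. Dongguan) will legitimately appear here.
with open('./all.json', encoding='utf-8') as f:
    areas = json.load(f)
missing = [
    str(item['adcode']) for item in areas
    if str(item['adcode']).endswith('00')
    and not os.path.exists(f"dist/{item['adcode']}_full.json")
]
print("Missing _full files:", missing or "none")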
Finally, compress the files, otherwise they take up too much space:
import os
import json

# Compress one JSON file by stripping all redundant whitespace
def compress_json_file(input_filepath, output_dir):
    try:
        # Read the JSON data
        with open(input_filepath, 'r', encoding='utf-8') as file:
            data = json.load(file)
        # Make sure the output directory exists
        os.makedirs(output_dir, exist_ok=True)
        # Build the output file path
        filename = os.path.basename(input_filepath)
        output_filepath = os.path.join(output_dir, filename)
        # Write the JSON back without indentation or spaces
        with open(output_filepath, 'w', encoding='utf-8') as file:
            json.dump(data, file, ensure_ascii=False, separators=(',', ':'))
        print(f"Compressed: {filename} -> {output_filepath}")
    except Exception as e:
        print(f"Failed to compress {input_filepath}: {e}")

# Walk the dist folder and compress every JSON file in it
def compress_all_json_files(input_dir, output_dir):
    if not os.path.exists(input_dir):
        print(f"Input directory {input_dir} does not exist.")
        return
    # Visit every JSON file under the input directory
    for root, _, files in os.walk(input_dir):
        for file in files:
            if file.endswith('.json'):
                input_filepath = os.path.join(root, file)
                compress_json_file(input_filepath, output_dir)

# Main entry point
def main():
    input_dir = './dist'         # directory with the original JSON files
    output_dir = './compressed'  # directory for the compressed copies
    compress_all_json_files(input_dir, output_dir)
    print(f"All JSON files from {input_dir} have been compressed into {output_dir}")

if __name__ == "__main__":
    main()
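To see what the whitespace stripping actually saves, you can compare the total size of the two directories; a minimal sketch (directory names match the script above):

import os

# Sketch: total size of all files under a directory, in megabytes.
def dir_size_mb(path):
    total = sum(
        os.path.getsize(os.path.join(root, name))
        for root, _, files in os.walk(path)
        for name in files
    )
    return total / (1024 * 1024)

print(f"dist: {dir_size_mb('./dist'):.1f} MB")
print(f"compressed: {dir_size_mb('./compressed'):.1f} MB")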
Original article: https://blog.csdn.net/Shi_haoliu/article/details/143889595