项目描述
上传时间
浏览人数
本案例要求在控制台实时输出已保存的图片数(以汉字形式展现)以及耗时,因此使用了下面链接文章中的 Class,用于阿拉伯数字与汉字之间的转换,链接:
https://blog.csdn.net/vivian_ll/article/details/95172583
案例预期图:
代码部分:
# -*- coding:utf-8 -*-
# author:Administrator
# datetime:2021/7/6 10:13
# software:PyCharm
"""Download Baidu image-search thumbnails for a keyword.

While downloading, the script prints the running image count (written in
Chinese numerals) together with the elapsed time.
"""
import datetime
import json
import os
import random
import urllib
import urllib.parse
from json import JSONDecodeError

import requests
from requests.exceptions import HTTPError, ConnectionError, ReadTimeout

# Number of results requested per search page (matches "rn=30" in the URL).
_IMAGES_PER_PAGE = 30

# Saved files are named <prefix><sequence number>.jpg.
_SAVE_PREFIX = "../imgs/baidu/pra"

# Search endpoint; {word} is the percent-encoded keyword, {pn} the result offset.
_SEARCH_URL_TEMPLATE = (
    "https://image.baidu.com/search/acjson?tn=resultjson_com&"
    "logid=5665116163790274110&ipn=rj&ct=201326592&is=&fp=result&"
    "queryWord={word}&cl=2&lm=-1&ie=utf-8&oe=utf-8&adpicid=&st=&z=&"
    "ic=&hd=&latest=&copyright=&word={word}&s=&se=&tab=&width=&height="
    "&face=&istype=&qc=&nc=1&fr=&expermode=&nojc=&pn={pn}&rn=30&gsm="
    "1e&1625626613025="
)

_USER_AGENTS = (
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 "
    "(KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64;"
    " Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729;"
    " .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER)",
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E; LBBROWSER)",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, "
    "like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident"
    "/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; SE 2.X MetaSr 1.0)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident"
    "/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)",
)


class NumberToHanzi():
    """Convert strings of Arabic digits into Chinese numerals."""

    # Unit appended to each 4-digit section, indexed from the lowest section.
    _SECTION_UNITS = ("", "万", "亿")

    def __init__(self):
        self.han_list = ["零", "一", "二", "三", "四", "五", "六", "七", "八", "九"]
        self.unit_list = ["十", "百", "千"]

    def four_to_hanstr(self, num_str):
        """Convert a digit string of at most four digits to Chinese numerals.

        Runs of zeros collapse into a single "零". A leading or trailing
        "零" is kept in the result (e.g. "0100" -> "零一百零"); ``dig2cn``
        trims them where appropriate.

        :param num_str: string of one to four decimal digits.
        :return: the Chinese-numeral string for ``num_str``.
        :raises ValueError: if a character is not a decimal digit.
        :raises IndexError: if a non-zero digit sits above the thousands place.
        """
        result = ""
        last_index = len(num_str) - 1
        for i, char in enumerate(num_str):
            num = int(char)
            if num != 0 and i != last_index:
                # Non-final, non-zero digit: digit followed by 十/百/千.
                result += self.han_list[num] + self.unit_list[last_index - 1 - i]
            elif num == 0 and result.endswith("零"):
                # Already emitted a "零" for this run of zeros.
                continue
            else:
                result += self.han_list[num]
        return result

    def dig2cn(self, num_str):
        """Convert a digit string of up to twelve digits to Chinese numerals.

        The string is split into 4-digit sections from the right; each
        non-zero section is converted and suffixed with its unit ("亿", "万"
        or nothing). A single "零" marks skipped zeros between sections.

        :param num_str: string of decimal digits.
        :return: the Chinese-numeral string ("零" for zero), or ``None``
            (after printing a message) when ``num_str`` is longer than
            twelve characters.
        :raises ValueError: if ``num_str`` is empty or not all decimal digits.
        """
        if len(num_str) > 12:
            print('数字太大,翻译不了')
            return None
        if not num_str.isdecimal():
            raise ValueError("num_str must be a non-empty string of digits: %r" % (num_str,))

        # sections[0] is the lowest four digits, sections[1] the 万 section, ...
        sections = []
        remaining = num_str
        while remaining:
            sections.append(remaining[-4:])
            remaining = remaining[:-4]

        pieces = []
        zero_gap = False  # an all-zero section was skipped since the last piece
        for index in range(len(sections) - 1, -1, -1):
            digits = sections[index]
            if int(digits) == 0:
                zero_gap = True
                continue
            # A trailing "零" inside a section is never read aloud.
            text = self.four_to_hanstr(digits).rstrip("零")
            if not pieces:
                # Nothing precedes the first section, so drop its leading "零".
                text = text.lstrip("零")
            elif zero_gap and not text.startswith("零"):
                text = "零" + text
            pieces.append(text + self._SECTION_UNITS[index])
            zero_gap = False
        return "".join(pieces) or "零"


def random_user_agent():
    """Return one of the predefined User-Agent strings, picked at random."""
    return random.choice(_USER_AGENTS)


def get_url(keywords, page):
    """Build the search-result URLs for the first ``page`` result pages.

    :param keywords: search keyword; it is percent-encoded before being
        placed into the URL.
    :param page: number of pages to generate URLs for.
    :return: list of URLs whose ``pn`` offsets are 0, 30, 60, ...
    """
    code_name = urllib.parse.quote(keywords, safe='/', encoding=None, errors=None)
    return [
        _SEARCH_URL_TEMPLATE.format(word=code_name, pn=index * _IMAGES_PER_PAGE)
        for index in range(page)
    ]


def get_imgs(url_list):
    """Download every thumbnail listed by the given search-result URLs.

    Each image is written to ``../imgs/baidu/pra<N>.jpg`` and a progress
    line with the image number (in Chinese numerals) and the elapsed time
    is printed.

    :param url_list: search-result URLs as produced by ``get_url``.
    :return: the last progress line printed ("" if no image was saved),
        or ``None`` if a request failed with an HTTP/connection/timeout error.
    """
    started_at = get_time()
    display = ""
    converter = NumberToHanzi()
    # Make sure the target directory exists before the first write.
    os.makedirs(os.path.dirname(_SAVE_PREFIX), exist_ok=True)
    try:
        for page_index, url in enumerate(url_list):
            headers = {'User-Agent': random_user_agent()}
            response = requests.get(url, headers=headers, timeout=10)
            # A non-2xx status raises HTTPError, handled below.
            response.raise_for_status()
            response.encoding = response.apparent_encoding
            try:
                payload = json.loads(response.text, strict=False)
            except JSONDecodeError as err:
                # Skip a page whose body is not valid JSON.
                print(err)
                continue
            for count, element in enumerate(payload.get("data") or []):
                # "thumbURL" holds the address of the thumbnail image.
                pic = element.get("thumbURL")
                if not pic:
                    continue
                total_num = page_index * _IMAGES_PER_PAGE + count + 1
                ans = converter.dig2cn(str(total_num))
                save_path = _SAVE_PREFIX + str(total_num) + ".jpg"
                image = requests.get(pic, timeout=10)
                with open(save_path, "wb") as image_file:
                    image_file.write(image.content)
                elapsed = get_time() - started_at
                display = "获取第{}张图片(耗时:{:.2f}s~)".format(ans, elapsed)
                print(display)
        return display
    except (HTTPError, ConnectionError, ReadTimeout) as e:
        print(e)
        return None


def get_time():
    """Return the current local time as a POSIX timestamp (float seconds)."""
    return datetime.datetime.now().timestamp()


if __name__ == '__main__':
    kw = input("需要输入想爬的关键字:")
    str_page = input("需要输入爬的页数(1页=30张):")
    page = int(str_page)
    a = get_url(kw, page)
    b = get_imgs(a)
成品效果gif: