Python 3: Semi-Automatically Collecting Douban Images with ChromeDriver and Submitting Them to a Chevereto Image Host

Previously we wrote a Python crawler that scraped images from Douban group posts and pushed them into a Chevereto image host through its API. Some groups now require a login, however, so here we build a semi-automatic script that uses ChromeDriver to log in to Douban and then submits the images to the Chevereto host.
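As background, Chevereto's v1 API can ingest an image directly by URL: a single GET request carrying the API "key" and the image URL as "source". Below is a minimal sketch of that call, using the same endpoint and key as the full script further down; the helper name and the example image URL are placeholders, not part of the original script.

import json
import urllib.parse
import urllib.request

def upload_to_chevereto(image_url, api_key):
    # Ask the Chevereto host to fetch and store the image at image_url
    submit_url = ('http://tutie.org/api/1/upload/?key=' + urllib.parse.quote_plus(api_key)
                  + '&source=' + urllib.parse.quote_plus(image_url))
    with urllib.request.urlopen(submit_url) as res:
        data = json.loads(res.read().decode())
    return data['status_code']  # 200 means the upload succeeded

# Example call (placeholder values):
# upload_to_chevereto('https://example.com/some-image.jpg', 'f14fe8e8b11c08902a489706d7d99941')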

Note: 老季 is new to Python, and most of this script was pieced together from code found online.

ChromeDriver download link

http://npm.taobao.org/mirrors/chromedriver/

Python code example

#!/usr/bin/python
# -*- coding: UTF-8 -*-
# Author: 老季
# Site: https://www.laoji.org

# Imports
import json
import logging
import random
import time
import urllib.error
import urllib.parse
import urllib.request

import requests
from selenium import webdriver

# Instantiate the driver
# option = webdriver.ChromeOptions()
# driver = webdriver.Chrome(chrome_options=option)
driver = webdriver.Chrome(r'D:\Downloads\chromedriver.exe')  # path to chromedriver

# Log in to Douban
driver.get('https://accounts.douban.com/passport/login')
driver.find_element_by_class_name('account-tab-account').click()  # switch to password login
time.sleep(2)
driver.find_element_by_id('username').send_keys('your-douban-username')  # enter username
driver.find_element_by_id('password').send_keys('your-douban-password')  # enter password
driver.find_element_by_css_selector("[class='btn btn-account btn-active']").click()  # click login
time.sleep(5)
# TODO: captcha handling (a manual workaround is sketched after this script)

# Copy the browser cookies into a requests session
session = requests.Session()
cookies = driver.get_cookies()
for cookie in cookies:
    session.cookies.set(cookie['name'], cookie['value'])
# print(cookies)
time.sleep(3)

logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s',
                    datefmt='%Y-%m-%d %A %H:%M:%S',
                    # write the log as UTF-8; change the path as needed
                    handlers=[logging.FileHandler('Selenium_img_douban.log', mode='w', encoding='utf-8')])


def submit_img(url):
    '''Submit an image URL to the Chevereto host through its v1 API.'''
    submit_url = ('http://tutie.org/api/1/upload/?key='
                  + urllib.parse.quote_plus('f14fe8e8b11c08902a489706d7d99941')
                  + '&source=' + urllib.parse.quote_plus(url))
    # print(submit_url)
    req = urllib.request.Request(submit_url)
    try:
        res = urllib.request.urlopen(req)
        data = json.loads(res.read().decode())
        return str(data['status_code'])
    except urllib.error.HTTPError as e:
        print(e.read())


def get_headers():
    '''Pick a random User-Agent header.'''
    user_agents = ['Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
                   'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
                   'Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11']
    headers = {'User-Agent': random.choice(user_agents)}
    return headers


def get(url, group_url, start_page, end_page, params=None):
    '''
    Send a GET request to url, then walk the group's discussion pages.
    :param url: page used to warm up the session
    :param group_url: group discussion list URL, without the start offset
    :param start_page: first page to scrape (1-based)
    :param end_page: last page to scrape
    :param params: optional dict of URL parameters
    '''
    # Wait a random 0-6 seconds to avoid an IP ban; in testing this was
    # enough to stay clear of Douban's anti-crawler checks
    time.sleep(6 * random.random())
    resp = session.get(url, params=params, headers=get_headers())
    print(resp)
    logging.info(resp)
    if resp:
        logging.info('[get] url = {0}, status_code = {1}'.format(url, resp.status_code))
        resp.encoding = 'utf-8'
        # Important: refresh the session cookies after every request so they do not expire
        if resp.cookies.get_dict():
            session.cookies.update(resp.cookies)
            logging.info('[get] updated cookies, new cookies = {0}'.format(resp.cookies.get_dict()))
    else:
        logging.info('[get] url = {0}, response is None'.format(url))
    logging.info('[info] looking for the group discussion list...')
    logging.info('[get] entered the group home page')
    while start_page <= end_page:
        driver.get(group_url + str((start_page - 1) * 25))  # 25 topics per page
        logging.info('entered page ' + str(start_page))
        print('entered page ' + str(start_page))
        # Topic links on the current page
        links = driver.find_elements_by_xpath(r"//tbody/tr/td[@class='title']/a")
        topic_links = {}
        for link in links:
            topic_link = link.get_attribute('href')
            title_name = link.get_attribute('title')
            # Images are named after the topic title; map '?' to its full-width
            # form and strip the other characters that are illegal in Windows filenames
            img_name = (title_name.replace('?', '？').replace('\\', '').replace('/', '')
                        .replace('"', '').replace(':', '').replace('*', '')
                        .replace('<', '').replace('>', '').replace('|', '')
                        .replace('\n', ''))
            topic_links[topic_link] = img_name
        topic_num = 0
        for topic_link, img_name in topic_links.items():
            logging.info('page ' + str(start_page) + ', topic ' + str(topic_num + 1))
            topic_num += 1
            driver.get(topic_link)
            # Check whether this topic contains any images
            img_links = driver.find_elements_by_class_name('image-wrapper')
            if len(img_links) > 0:
                i = 0
                for img_link in img_links:
                    img_name = topic_links[topic_link] + '_' + str(i)
                    img_src = img_link.find_element_by_xpath('./img').get_attribute('src')
                    print('[get] topic: ' + img_name + ', image link: ' + img_src)
                    img = img_src.replace('.webp', '.jpg')
                    print('submitting image link: ' + img)
                    submit_img(img)
                    # download_img = urllib.request.urlretrieve(img_src, 'F:\豆瓣下载\%s.jpg' % img_name)
                    # time.sleep(3 * random.random())
                    i += 1
            else:
                print('no images under this topic')
            # img_links2 = driver.find_elements_by_css_selector("[class='topic-figure cc']")  # unused alternative selector
            time.sleep(3 * random.random())
        start_page += 1
        time.sleep(3 * random.random())
    time.sleep(5)
    driver.quit()


if __name__ == '__main__':
    home_url = 'https://www.douban.com/'
    group_url = 'https://www.douban.com/group/481977/discussion?start='  # the "晒晒你最性感的照片" group
    get(home_url, group_url, 1, 10000)  # pages are 1-based; starting at 0 would request start=-25
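The "TODO: captcha" in the login section is where the "semi-automatic" part comes in: Douban sometimes shows a captcha that the script cannot solve on its own. One simple workaround, sketched below on the assumption that you run the script interactively, is to replace the fixed time.sleep(5) after the login click with a manual pause so you can solve the captcha in the Chrome window yourself; the input() prompt is an illustration, not part of the original script.

# After clicking the login button, wait for a human instead of a fixed 5 seconds
driver.find_element_by_css_selector("[class='btn btn-account btn-active']").click()
input('If Douban shows a captcha, solve it in the Chrome window, then press Enter to continue...')
# ...then read driver.get_cookies() and continue as in the script above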
