使用python爬取壁纸网站

第一次写博客，没什么好写的，就把上个学期用 Python 爬壁纸网站的代码贴出来吧。一开始是逛知乎的时候看见了这个壁纸网站 facets，点开看见里面的壁纸还行，就打算用 Python 把网站上所有的壁纸都爬下来。先贴代码：

import os
import re
import time
from threading import Lock, Thread

import requests
from bs4 import BeautifulSoup

# Common prefix of the full-size wallpaper URLs. Spider.get_link appends the
# part of each thumbnail URL that starts at its first underscore.
link = "http://www.facets.la/wallpaper/W"

class Spider(Thread):
    """Worker thread that scrapes one listing page and saves its wallpapers.

    ``run`` delegates to ``get_link``, which fills ``self.download`` with
    ``{'name': ..., 'pic': ...}`` entries and then writes every picture into
    the ``python_download_picture`` directory.
    """

    # Upper bound, in seconds, for every HTTP request so that a stalled
    # connection cannot block the thread forever.
    REQUEST_TIMEOUT = 30

    # Directory (relative to the working directory) that receives the files.
    TARGET_DIR = 'python_download_picture'

    # Guards the read-modify-write of the module-level ``error_count``,
    # which is shared by every Spider thread.
    _error_count_lock = Lock()

    def __init__(self, url):
        """Remember the listing page *url*; no downloads are queued yet."""
        super(Spider, self).__init__()
        self.url = url
        self.download = []
        # Reference time for the progress message when the module does not
        # define a global ``start`` (e.g. the class is used via import).
        self._created = time.time()

    def run(self):
        """Thread entry point: scrape the page and download its pictures."""
        self.get_link()

    @staticmethod
    def _thumb_suffix(src):
        """Return the part of *src* from its first underscore onward.

        Returns ``None`` when *src* is empty/``None`` or has no underscore,
        because no full-size URL can be derived from it in that case.
        """
        if not src:
            return None
        pos = src.find('_')
        if pos == -1:
            return None
        return src[pos:]

    def _save(self, filename, data):
        """Write *data* under *filename*, or under ``picture<N>.jpg``.

        The scraped name may contain characters that are not valid in a file
        name. When writing under the scraped name fails, the picture is
        stored under a generated name numbered by the global ``error_count``.

        Raises:
            OSError: if the generated fallback name cannot be written either.
        """
        global error_count
        os.makedirs(self.TARGET_DIR, exist_ok=True)
        try:
            # The name comes from the remote page: never let it address a
            # location outside the target directory.
            if os.path.basename(filename) != filename:
                raise OSError("file name contains a path separator")
            with open(os.path.join(self.TARGET_DIR, filename), 'wb') as f:
                f.write(data)
            return
        except (OSError, ValueError) as e:
            # ValueError covers names with an embedded NUL character.
            print(e, filename)
        with self._error_count_lock:
            try:
                number = error_count
            except NameError:
                # The counter is normally created by the script entry point.
                number = 0
            error_count = number + 1
        filename = "picture" + str(number) + ".jpg"
        print("rename as %s" % filename)
        with open(os.path.join(self.TARGET_DIR, filename), 'wb') as f:
            f.write(data)

    def get_link(self):
        """Collect the picture links of ``self.url`` and download them all.

        Entries that lack an ``<img>`` source with an underscore or the name
        ``<span>`` are skipped. A failed HTTP request is reported on stdout
        and skipped instead of aborting the remaining downloads.
        """
        global start
        try:
            page = requests.get(self.url, timeout=self.REQUEST_TIMEOUT)
            page.raise_for_status()
        except requests.RequestException as e:
            print(e, self.url)
            return
        soup = BeautifulSoup(page.text, 'lxml')
        for item in soup.find_all(class_=["thumb", "thumb-large"]):
            img = item.find('img')
            label = item.find('span', attrs={"style": "font-weight: 400;"})
            # The scraped src is the thumbnail URL; the full-size URL shares
            # everything from the first underscore onward.
            suffix = self._thumb_suffix(img.get('src') if img is not None else None)
            if suffix is None or label is None:
                continue
            self.download.append({'name': label.text, 'pic': link + suffix})
        try:
            began = start
        except NameError:
            began = self._created
        print("在%d秒内完成对网页%s的图片链接抓取,开始下载" % (time.time() - began, self.url))
        for dic in self.download:
            filename = dic['name'] + '.jpg'
            print('downloading....:%s' % dic['name'])
            # Fetch before opening the file so that a network failure does
            # not leave an empty file behind.
            try:
                response = requests.get(dic['pic'], timeout=self.REQUEST_TIMEOUT)
                response.raise_for_status()
            except requests.RequestException as e:
                print(e, dic['pic'])
                continue
            self._save(filename, response.content)

def main():
    """Crawl every listing page of the site, one Spider thread per page.

    Creates the ``python_download_picture`` directory if needed, starts a
    thread for the front page and for each ``/offset/<n>/`` page
    (n = 63, 126, ..., 315), and returns once all threads have finished.
    """
    os.makedirs("python_download_picture", exist_ok=True)
    url_list = ['http://www.facets.la/']  # pages to crawl
    for num in range(63, 316, 63):
        url_list.append("http://www.facets.la/offset/" + str(num) + '/')
    process_list = [Spider(url) for url in url_list]  # one thread per page
    for p in process_list:
        p.start()
    # Join only after every thread has been started; joining inside the
    # start loop would run the pages one after another.
    for p in process_list:
        p.join()

# Script entry point. `start` and `error_count` are module-level globals that
# Spider.get_link reads (and, for `error_count`, increments).
if __name__ == "__main__":
    start = time.time()
    error_count = 0  # number of pictures saved under a generated fallback name
    main()
    print("finish in %d seconds" % (time.time() - start))

下面正式讲述爬取过程：

1. 网站分析（此处原有截图）：观察网站前三页的 url，分别为 http://www.facets.la 、http://www.facets.la/offset/63/ 和 http://www.facets.la/offset/126/ 。可以看到每翻一页，url 中的偏移量递增 63，最后一页的网址是 http://www.facets.la/offset/315/ 。所以可以用一个列表来储存所有待爬取的 url，对应上述代码中的这一段：

# Build the list of listing pages: the front page plus every offset page.
url_list = ['http://www.facets.la/']  # pages to crawl
for num in range(63, 316, 63):
    # Each page shows 63 pictures, so the offset grows in steps of 63.
    url = "http://www.facets.la/offset/" + str(num) + '/'
    url_list.append(url)

2. 网页元素分析：使用 Chrome 自带的“审查元素”进行分析，所有带有图片链接和图片名称的 div 标签，其 class 为 "thumb" 或 "thumb-large"：第一张比较特殊，是 "thumb-large"，后面的都是 "thumb"（此处原有截图）。但要注意，这里看到的只是缩略图，其链接为 http://www.facets.la/fullview/F_2014_365.jpg ；而点进图片后，download paper 对应的元素链接却是 http://www.facets.la/wallpaper/W_2014_365.jpg 。观察几张图片能够发现，完整图片的链接一定是前缀 “http://www.facets.la/wallpaper/W” 再加上缩略图链接中最后一个大写字母之后的部分（代码中即从第一个下划线开始的部分），例如上面这张图片对应的部分是 “_2014_365.jpg”。搞定图片链接之后，图片的名称就比较好办了。用审查元素点一下就可以看到每张图片的名称，以第一幅 Agent J 为例，包含名字的 html 为 <span style="font-weight: 400;">Agent J</span>，只要选出 style="font-weight: 400;" 的 span 标签即可。使用 BeautifulSoup 库解析如下：

# Excerpt from Spider.get_link: parse the listing page and queue the pictures.
html = requests.get(self.url)
soup = BeautifulSoup(html.text, 'lxml')
for item in soup.find_all(class_={"thumb", 'thumb-large'}):
    # The scraped src is the thumbnail URL; the full-size URL is derived
    # from it by keeping everything from the first underscore onward.
    url = item.find('img').get('src')
    url = link + url[url.find('_'):]
    name = item.find('span', attrs={"style": "font-weight: 400;"}).text
    if url is not None:
        dic = {'name': name, 'pic': url}
        self.download.append(dic)

3. 注意事项：由于是直接用爬取到的图片名称给图片文件命名，爬下来的名称有可能带有 ‘?’ 等不能用作路径名的特殊字符，所以在保存时使用 try-except 捕获这种错误，并对这类图片单独命名。 4. 正式开始爬取网站：爬取过程中使用了 Python 中的 Thread 类，Spider 类继承 Thread 并重写 run 方法，具体过程不再赘述，贴几篇看多线程时看过的博客： [python采用多进程/多线程/协程写爬虫以及性能对比][python采用多进程/多线程/协程写爬虫以及性能对比]

[python采用多进程/多线程/协程写爬虫以及性能对比]: https://www.cnblogs.com/huangguifeng/p/7632799.html "python采用多进程/多线程/协程写爬虫以及性能对比"