爬取百度搜索

在用谷歌语法搜索有某些特征的链接时，如果想把这些链接全部保存起来，这个时候就可以使用爬虫技术，爬取这些链接保存下来。下面就来分析并写出这个爬虫程序。

网页分析

分析搜索链接

每页的网页链接一般都有固定的格式。例如百度的搜索结果链接，只保留两个参数（wd 为搜索关键词，pn 为结果偏移量，每页 10 条）时是这样：

1	https://www.baidu.com/s?wd=ctf&pn=10

分析搜索页面中的链接

F12对当前页面分析每个链接的特点，百度搜索有点坑，你会发现百度都是通过一个长长的链接302跳转来访问的，随便选取一个链接都是这种

1	<a target="_blank" href="你搜索的URL" class="c-showurl" style="text-decoration:none;">www.php.net/downloa...php</a>

特征就是 class="c-showurl" 这个属性值，用 bs 库去获取所有带有这个属性的 tag：res = soup.find_all(name='a', attrs={'class': 'c-showurl'})

访问链接

访问跳转链接获取实际网站url,title之类的信息

爬虫实现

#!/usr/bin/env python
#coding=utf-8
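# Usage: python <script> -s <search keyword> -f <output file>
# Result pages follow a fixed URL pattern. Keeping only two query parameters
# (wd = keyword, pn = result offset, 10 results per page) a page looks like:
# https://www.baidu.com/s?wd=ctf&pn=10
# Inspecting a result page with F12 shows that Baidu never links to the target
# site directly: every result goes through a long redirect (302) link such as: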
#<a target="_blank" href="http://www.baidu.com/link?url=GI9K125i3rnLbxL2-kKs-2g2OZt-oDTJZZIFjndQHxGiDubfIEpvNxnnCc1h5ags" class="c-showurl" style="text-decoration:none;">www.secbox.cn/tag/<b>ctf</b>&nbsp;</a>
import requests	
from bs4 import BeautifulSoup as bs
import threading	#多线程
import re  #正则
from Queue import Queue  #线程优先级队列（ Queue）
from prettytable import PrettyTable  #将输出内容如表格方式整齐 
import argparse  #命令行解析
import time
import sys
thread_count = 3  # number of worker threads started by Load_Thread()
page = 5  # page count setting; adjust to crawl more or fewer result pages
urls = []  # shared result list; workers append (page_number, url, title) tuples
# Table used by main() to print the collected results.
table = PrettyTable(['page','url','title'])
# Left-align the title column. The value must be the letter 'l'; the digit '1'
# is not a valid alignment and silently falls back to centred text.
table.align['title'] = 'l'
table.padding_width = 1  # one space between the column border and the content
# Turn the page count into the exclusive upper bound of the `pn` offset that
# main() walks with range(0, page, 10).
# NOTE(review): with page = 5 this yields offsets 0..50, i.e. six result pages;
# confirm whether the extra page is intended.
page = (page+1) * 10
class mythread(threading.Thread):
	"""Worker thread that crawls Baidu result pages taken from a shared queue.

	Each queue item is a search-page URL ending in ``pn=<offset>``. For every
	result link on that page the worker follows Baidu's redirect and appends a
	``(page_number, final_url, title)`` tuple to the module-level ``urls`` list.
	"""

	# Contents of the first <title> element in raw (byte) HTML; case-insensitive
	# and allowed to span several lines.
	_TITLE_RE = re.compile(br'<title[^>]*>(.*?)</title>', re.I | re.S)

	def __init__(self,queue):
		"""Store the shared work *queue* and the request headers to send."""
		threading.Thread.__init__(self)
		self.Q = queue
		self.headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:50.0) Gecko/20100101 Firefox/50.0'}

	def run(self):
		"""Process queued search-page URLs until the queue stays empty for 1s.

		A failure while crawling one page is printed and the worker moves on to
		the next item, so a single bad page cannot strand the rest of the queue.
		"""
		# Local import so the block works with both module names.
		try:
			from queue import Empty  # Python 3
		except ImportError:
			from Queue import Empty  # Python 2
		while True:
			try:
				target = self.Q.get(True,1)
			except Empty:
				# Nothing left to do: let the thread finish.
				break
			try:
				self.spider(target)
			except Exception as e:
				# Print the error so failures inside spider() can be located.
				print(e)

	@staticmethod
	def _page_number(target):
		"""Return the 1-based page number encoded in the trailing ``pn=<offset>``."""
		# e.g. https://www.baidu.com/s?wd=ctf&pn=10 -> offset 10 -> page 2
		return int(target.split('=')[-1]) // 10 + 1

	@classmethod
	def _extract_title(cls,content):
		"""Return the page title found in the raw bytes *content* as text.

		Returns an empty string when there is no ``<title>`` element. The title
		is decoded as UTF-8, then GBK, and finally as UTF-8 with replacement
		characters, so an undecodable title never discards the result.
		"""
		match = cls._TITLE_RE.search(content)
		if match is None:
			return u''
		raw = match.group(1).strip()
		for encoding in ('utf-8','gbk'):
			try:
				return raw.decode(encoding)
			except UnicodeDecodeError:
				continue
		return raw.decode('utf-8','replace')

	def spider(self,target):
		"""Crawl one search-result page and record every reachable result.

		Args:
			target: search-page URL whose last query parameter is ``pn``.

		Raises:
			Whatever ``requests`` raises for the search page itself; errors on
			individual result links are skipped.
		"""
		pn = self._page_number(target)
		# Timeout added so a stalled search request cannot hang the worker.
		html = requests.get(target,headers=self.headers,timeout=10)
		soup = bs(html.text,'lxml')
		for link in soup.find_all(name='a', attrs={'class':'c-showurl'}):
			try:
				# Baidu result links are 302 redirects, so request them again
				# to learn the real URL and page content.
				h = requests.get(link['href'],headers=self.headers,timeout=3)
				if h.status_code != 200:
					continue
				urls.append((pn,h.url,self._extract_title(h.content)))
			except Exception:
				# Best effort: an unreachable or malformed link is skipped.
				continue
def Load_Thread(queue):
	"""Build ``thread_count`` worker threads that all consume *queue*.

	The workers are created but not started; the list is returned to the caller.
	"""
	workers = []
	for _ in range(thread_count):
		workers.append(mythread(queue))
	return workers
def Start_Thread(threads):
	"""Start every thread in *threads* as a daemon and wait for all to finish.

	Args:
		threads: iterable of not-yet-started ``threading.Thread`` objects.
	"""
	# Materialise once: the collection is walked twice (start, then join).
	threads = list(threads)
	print('thread is start...')
	for t in threads:
		# Daemon threads do not keep the process alive if the main thread exits.
		t.daemon = True
		t.start()
	for t in threads:
		t.join()
	print('thread is end...')
def main():
	"""Parse the command line, crawl the result pages and report the findings.

	Command line:
		-s  search keyword (required)
		-f  file to which the crawled URLs are appended, one per line (optional)

	The collected records are printed as a table, followed by the record count
	and the elapsed time in seconds.
	"""
	# Local import so the block works with both module layouts.
	try:
		from urllib.parse import quote  # Python 3
	except ImportError:
		from urllib import quote  # Python 2

	start = time.time()
	parser = argparse.ArgumentParser()
	# Required: without a keyword the crawler would search for the text "None".
	parser.add_argument('-s', required=True, help='search keyword')
	parser.add_argument('-f', help='file to append the crawled URLs to')
	arg = parser.parse_args()

	# Percent-encode the keyword so characters such as '&', '#', spaces or
	# non-ASCII text cannot break the query string.
	word = quote(arg.s, safe='')
	output = arg.f

	# One queue item per result page; `pn` is the result offset of that page.
	queue = Queue()
	for i in range(0,page,10):
		queue.put('https://www.baidu.com/s?wd=%s&pn=%s'%(word,i))
	Start_Thread(Load_Thread(queue))

	# Append the final URLs to the output file when one was requested.
	if output:
		with open(output,'a') as f:
			for record in urls:
				f.write(record[1]+'\n')
	for record in urls:
		table.add_row(list(record))
	print(table)
	print('共爬取数据%s条'%len(urls))
	print(time.time()-start)
# Run the crawler only when this file is executed as a script, not on import.
if __name__ == '__main__':
	main()