Python爬虫——乌云厂商+域名

这是一个抓取乌云(WooYun)厂商列表的爬虫。乌云屏蔽了 requests 默认的 User-Agent 头,把它直接改成空字符串就可以抓到了。为这个问题郁闷了半天。

#!/usr/bin/env python
#coding:utf-8
__author__ = 'xMyth.mtfly'

import requests
import re

def get_list(b):
	"""Append every vendor link target found in a page to ``list.txt``.

	The pattern captures the text that follows ``/corps/`` in each link
	that is immediately preceded by ``370">`` (presumably the tail of a
	``width="370"`` cell attribute -- confirm against the live markup).
	Each captured value is written on its own line; the file is opened in
	append mode, so repeated calls accumulate results.

	Args:
		b: Page HTML, either as text or as raw bytes (e.g. the ``.content``
			of a ``requests`` response). Bytes are decoded as UTF-8, with
			undecodable sequences replaced rather than raising.

	Returns:
		None. The only effect is the data appended to ``list.txt`` in the
		current working directory (the file is created if missing).
	"""
	# Function-scope import keeps this block self-contained; io.open behaves
	# the same on Python 2 and 3 and lets us pin the output encoding.
	import io

	# A str pattern cannot be applied to bytes on Python 3, so normalise the
	# input to text before matching.
	if isinstance(b, bytes):
		b = b.decode('utf-8', 'replace')

	names = re.findall('370"><a href="/corps/(.*?)">', b)

	# ``with`` guarantees the handle is closed even if a write fails.
	with io.open('list.txt', 'a', encoding='utf-8') as f:
		f.writelines(name + u'\n' for name in names)

def get_urls(b):
	"""Append the link text of every matching table-cell link to ``urls.txt``.

	The pattern captures whatever sits between ``target="_blank">`` and
	``</a></td>`` on a single line. Each captured value is written on its
	own line; the file is opened in append mode, so repeated calls
	accumulate results.

	Args:
		b: Page HTML, either as text or as raw bytes (e.g. the ``.content``
			of a ``requests`` response). Bytes are decoded as UTF-8, with
			undecodable sequences replaced rather than raising.

	Returns:
		None. The only effect is the data appended to ``urls.txt`` in the
		current working directory (the file is created if missing).
	"""
	# Function-scope import keeps this block self-contained; io.open behaves
	# the same on Python 2 and 3 and lets us pin the output encoding.
	import io

	# A str pattern cannot be applied to bytes on Python 3, so normalise the
	# input to text before matching.
	if isinstance(b, bytes):
		b = b.decode('utf-8', 'replace')

	found = re.findall('target="_blank">(.*?)</a></td>', b)

	# ``with`` guarantees the handle is closed even if a write fails.
	with io.open('urls.txt', 'a', encoding='utf-8') as f:
		f.writelines(url + u'\n' for url in found)


def get_url():
	"""Fetch vendor-list pages 1 through 39 and pass each one to the parsers.

	Every page body is handed to ``get_list`` and ``get_urls``, which append
	their matches to ``list.txt`` and ``urls.txt`` respectively.

	Raises:
		requests.exceptions.RequestException: if a page cannot be fetched,
			including when the server does not respond within the timeout.
	"""
	# The site rejects requests' default User-Agent; sending an empty one
	# gets the page through. The dict never changes, so build it once.
	headers = {'User-Agent': ''}

	for page in range(1, 40):
		# Without a timeout a single stalled connection would block the whole
		# crawl indefinitely.
		response = requests.get(
			'http://www.wooyun.org/corps/page/%d' % page,
			headers=headers,
			timeout=30,
		)
		body = response.content
		get_list(body)
		get_urls(body)

# Start the crawl only when this file is run as a script, not when imported.
if __name__ == '__main__':
        get_url()

代码及厂商列表

http://pan.baidu.com/s/1hqiy0Vq