一个简单的多进程爬虫(爬取某加盟创业网)
分享一个简单的多进程小爬虫,爬取某加盟创业网上所有加盟项目;
使用requests请求页面,用re和BeautifulSoup解析网页,用multiprocessing实现多进程,用pymongo将数据入库;
(打开网站时发现密密麻麻、琳琅满目的加盟项目,但是爬完之后,发现数据也就只有那么一点点,甚至不到一千条数据,这就有点囧了)
# coding:utf-8
"""A simple multi-process crawler for a franchise-project listing site (78.cn).

Fetches pages with requests, parses them with re + BeautifulSoup,
parallelizes category scraping with multiprocessing, and stores each
project record in MongoDB via pymongo.

auther:zmister.com
"""
import re
from multiprocessing import Pool

import requests
from bs4 import BeautifulSoup
import pymongo

# MongoDB connection: database 'cn_78', collection 'project_info'
client = pymongo.MongoClient('localhost', 27017)
cn_78 = client['cn_78']
project_info = cn_78['project_info']

# Browser-like headers so the site serves the normal page to the script
header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36',
    'Connection': 'keep-alive',
}

# Patterns applied to every category page — compiled once (raw strings)
# instead of being re-parsed on each re.findall call.
_NAME_RE = re.compile(r'target="_blank">(.*?)</a></h2>')
_MONEY_RE = re.compile(r'<dt>投资:<em class="red">¥<strong>(.*?)</strong>')
_AREA_RE = re.compile(r'<dd>所属区域:(.*?)</dd>')
_TIME_RE = re.compile(r'<dd>累计时间:(.*?)</dd>')


def get_all_big_category():
    """Return the list of category-page hrefs scraped from the home page."""
    url = 'http://www.78.cn'
    wbdata = requests.get(url, headers=header).content
    soup = BeautifulSoup(wbdata, 'lxml')
    cates = soup.select('div.xmFl_left > ul > li > p > a')
    # Only the href is needed downstream; the link text was collected but
    # never used in the original, so it is dropped here.
    return [c.get('href') for c in cates]


def _split_money(mony):
    """Split an investment string like '1-5万' into ('1', '5') bounds.

    The '万' (ten-thousand) unit suffix is stripped. A single value such
    as '3万' yields the same number for both bounds.

    :param mony: raw investment string scraped from the page
    :return: (minmoney, maxmoney) tuple of numeric strings
    """
    stripped = mony.replace('万', '')
    if '-' in stripped:
        low, _, high = stripped.partition('-')
        return low, high
    # BUG FIX: the original stored None as minmoney and mony[0] — only the
    # FIRST CHARACTER of the string — as maxmoney for single-value amounts
    # (e.g. '15万' became '1'). Use the full numeric string for both bounds.
    return stripped, stripped


def get_project_info(cates):
    """Scrape one category page and insert every project into MongoDB.

    :param cates: category path (href) relative to http://www.78.cn
    """
    url = 'http://www.78.cn' + cates
    wbdata = requests.get(url, headers=header).content
    soup = BeautifulSoup(wbdata, 'lxml')
    cate = soup.select('div.dhy_cont3 > dl.cont3_top > dt > span')[0].get_text()
    subcate = soup.select('div.dhy_cont3 > dl.cont3_top > dd.check > a')[0].get_text()
    html = str(soup)
    p_name = _NAME_RE.findall(html)
    p_mony = _MONEY_RE.findall(html)
    p_aear = _AREA_RE.findall(html)
    p_time = _TIME_RE.findall(html)
    for name, mony, aear, times in zip(p_name, p_mony, p_aear, p_time):
        minmoney, maxmoney = _split_money(mony)
        # NOTE: key spellings ('mony', 'aear') are kept as-is so documents
        # already in the collection stay schema-compatible.
        data = {
            'cate': cate,
            'subcate': subcate,
            'name': name,
            'mony': mony,
            'aear': aear,
            'time': times,
            'minmoney': minmoney,
            'maxmoney': maxmoney,
        }
        print(data)
        project_info.insert_one(data)
        print("插入数据成功")


if __name__ == '__main__':
    # One pool of 4 workers, one task per category page;
    # close() + join() waits for all async work to finish.
    pool = Pool(processes=4)
    pool.map_async(get_project_info, get_all_big_category())
    pool.close()
    pool.join()
不知道对各位有没有帮助?