1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163
|
''' 导入各种包,其中: re.findall用来进行正则匹配 csv用来写csv文件 asyncio和aiohttp用来配合进行异步协程爬虫开发 time用来记录运行时间 logging用来显示错误日志 ''' from re import findall import csv import asyncio import aiohttp import time import logging import operator
# Root URL of the rental listings; the other functions append paths to it.
baseurl = "https://cd.lianjia.com/zufang"

# District names searched for on the landing page and zipped with the
# extracted district paths in main().
block_list = [
    "锦江",
    "青羊",
    "武侯",
    "高新",
    "成华",
    "金牛",
    "天府新区",
    "高新西",
]

# Shared aiohttp client session; stays None until main() assigns it.
session = None

# get() acquires this around every request, so at most 8 run at once.
semaphore = asyncio.Semaphore(8)
async def get(url):
    '''
    Fetch a page with an HTTP GET through the shared session; the async
    counterpart of requests.get.

    input: A URL
    output: This URL's HTML as text, or None when the request failed
    '''
    # The module-level semaphore caps how many requests run concurrently.
    async with semaphore:
        logging.info('Getting %s', url)
        try:
            async with session.get(url) as response:
                return await response.text()
        except (aiohttp.ClientError, asyncio.TimeoutError):
            # A request timeout surfaces as asyncio.TimeoutError, which is not
            # an aiohttp.ClientError; without catching it a single slow page
            # would propagate out and abort the whole crawl.
            logging.error('Error occurred while getting %s', url, exc_info=True)
            return None
def get_blockurls(html):
    '''
    Extract the district paths from the landing page.

    input: HTML text of the landing page
    output: one path (the part of the href after "/zufang") per entry of
            block_list, in the same order
    raises: IndexError when no link is found for one of the districts
    '''
    result = []
    for block in block_list:
        # [^"]* keeps the capture inside one href attribute. The previous lazy
        # ".*?" could start at an earlier link on the same line and swallow
        # the markup in between.
        matches = findall(r'href="/zufang([^"]*)" >' + block, html)
        if not matches:
            raise IndexError('no link found for district %s' % block)
        result.append(matches[0])
    return result
def get_subblock(html):
    '''
    Extract the names of the level-3 filter links from a district page.

    input: HTML text of a district page (may be None or empty)
    output: list of link texts; empty when there is no HTML or no match
    '''
    # get() returns None for a failed download; treat that as "no sub-blocks"
    # instead of failing on None.
    if not html:
        return []

    # Drop newlines, carriage returns, tabs and spaces in one pass so the
    # pattern below can be written without any whitespace.
    compact = html.translate(str.maketrans('', '', '\n\r\t '))

    result = []
    for anchor in findall(r'--level3"><ahref="/zufang(.*?)</a>', compact):
        # Each capture looks like <path>">name; keep the name part.
        pieces = anchor.split('">')
        if len(pieces) > 1:
            result.append(pieces[1])
    return result
def get_roomnum(html):
    '''
    Read the highlighted listing count from a district page.

    input: HTML text of a district page (may be None or empty)
    output: the count as the string found in the page, or '0' when the page
            is missing or contains no count
    '''
    if not html:
        return '0'
    matches = findall(r'content__title--hl">(.*?)</span>', html)
    # Fall back to '0' rather than raising IndexError when the marker is
    # absent; the caller converts the value with int().
    return matches[0] if matches else '0'
def _room_urls_on_page(page):
    '''Return the absolute room URLs found in one listing page ([] if none).'''
    if not page:
        return []
    # Remove all whitespace so the attribute sequence can be matched as one
    # contiguous string.
    compact = page.replace("\n", "").replace("\r", "").replace("\t", "").replace(" ", "")
    paths = findall(r'class="content__list--item--aside"target="_blank"href="/zufang(.*?)"title="', compact)
    return [baseurl + path for path in paths]


async def get_roomurls(html, num):
    '''
    Collect the room detail URLs for a listing of num rooms.

    input: html - HTML text of the first listing page (may be None)
           num  - total number of rooms, 30 per page
    output: list of absolute room URLs from the first page plus every
            following page that could be downloaded
    '''
    # Ceiling division: exactly 30 rooms is one page, not two.
    pagenum = (num + 29) // 30

    result = _room_urls_on_page(html)
    for p in range(2, pagenum + 1):
        # NOTE(review): the following pages are requested relative to baseurl
        # (the city-wide listing), not the district page whose HTML was passed
        # in. That looks unintended, but this function does not receive the
        # district path, so it is left as is.
        page = await get(baseurl + "/pg" + str(p) + "/#contentList")
        # A failed download (None) simply contributes no URLs.
        result.extend(_room_urls_on_page(page))
    return result
async def get_roommessage(html, bname, w2):
    '''
    Regex extraction and file-writing steps; described by the author as
    unimportant detail and not shown here.

    NOTE(review): presumably html is one room page, bname the district name
    and w2 a csv writer for the room rows -- confirm against the real
    implementation.
    '''
async def get_rooms(html, num, bname, w2):
    '''
    Download every room page of one district and hand it to get_roommessage.

    input: html  - HTML text of the district's first listing page
           num   - number of rooms listed (capped at 1000 here)
           bname - district name, passed through to get_roommessage
           w2    - writer object, passed through to get_roommessage
    output: None
    '''
    room_urls = await get_roomurls(html, min(num, 1000))
    if not room_urls:
        return

    for u in room_urls:
        room_r = await get(u)
        if not room_r:
            # Download failed; get() has already logged it.
            continue
        try:
            await get_roommessage(room_r, bname, w2)
        except Exception:
            # Stay best-effort (one bad page must not stop the district), but
            # record the failure instead of hiding it. Catching Exception
            # rather than everything lets task cancellation propagate.
            logging.exception('Failed to process room page %s', u)
async def geturls(block, bname):
    '''
    Download a district page and return the names of its sub-blocks.

    input: block - district path appended to baseurl
           bname - district name (not used by this function)
    output: list of sub-block names; empty when the page could not be
            downloaded
    '''
    block_r = await get(baseurl + block)
    if not block_r:
        # get() returns None on failure; report "nothing found" rather than
        # passing None on to the parser.
        return []
    return get_subblock(block_r)
async def get_message_main(block, bname, w1, w2):
    '''
    Process one district: record its listing count and crawl its rooms.

    input: block - district path appended to baseurl
           bname - district name written to the summary row
           w1    - csv writer receiving [district, count]
           w2    - writer passed through to get_rooms
    output: None
    '''
    print("运行了main一次")
    block_r = await get(baseurl + block)
    if not block_r:
        # get() returns None on failure. Skip this district instead of raising,
        # which would propagate into the gather() running all districts.
        logging.error('No page received for district %s; skipping it', bname)
        return

    room_num = get_roomnum(block_r)
    w1.writerow([bname, room_num])
    await get_rooms(block_r, int(room_num), bname, w2)
async def main():
    '''
    Crawl every district in block_list concurrently.

    Writes the per-district listing counts to file1.csv and passes a writer
    for file2.csv down to the room crawler. The shared session and both files
    are closed even when the crawl fails part-way.
    '''
    global session
    session = aiohttp.ClientSession()
    try:
        # newline='' is what the csv module asks for on files it writes;
        # without it rows can end up separated by blank lines on Windows.
        with open('file1.csv', 'w', encoding='utf-8', newline='') as f1, \
                open('file2.csv', 'w', encoding='utf-8', newline='') as f2:
            w1 = csv.writer(f1)
            w2 = csv.writer(f2)
            w1.writerow(['行政区域','挂网租房数量'])

            base_r = await get(baseurl)
            if not base_r:
                # get() returns None on failure; nothing can be crawled then.
                logging.error('Could not load %s; nothing to crawl', baseurl)
                return

            block_urls = get_blockurls(base_r)
            indextasks = [
                asyncio.ensure_future(get_message_main(block, bname, w1, w2))
                for block, bname in zip(block_urls, block_list)
            ]
            # return_exceptions=True lets the other districts finish when one
            # fails, so the files are not closed under tasks still running.
            outcomes = await asyncio.gather(*indextasks, return_exceptions=True)
            for bname, outcome in zip(block_list, outcomes):
                if isinstance(outcome, BaseException):
                    logging.error('Crawling district %s failed', bname, exc_info=outcome)
    finally:
        await session.close()
def paicsv():
    '''
    Sort the rows of file2.csv and rewrite the file with a header row.

    The file is read completely, the rows are sorted, and the file is then
    overwritten with the header followed by the sorted rows.
    '''
    # Read with the encoding main() writes (utf-8) rather than the platform
    # default, and use a context manager so the handle is closed before the
    # same path is reopened for writing.
    with open("file2.csv", "r", encoding="utf-8", newline="") as src:
        data = sorted(csv.reader(src))

    with open("file2.csv", "w", encoding="utf-8", newline="") as f:
        writer = csv.writer(f)
        writer.writerow(['行政区域','区域','小区','房型','房源维护时间','面积','朝向','维护','入住','楼层','电梯','车位','用水','用电','燃气','采暖','租期','看房','付款方式','租金','押金','洗衣机','空调','衣柜','电视','冰箱','热水器','床','暖气','宽带','天然气'])
        writer.writerows(data)
if __name__ == '__main__':
    # Run the crawl, report wall-clock timing, then sort the room rows.
    started_at = time.time()
    print("Start at: " , started_at)

    loop = asyncio.get_event_loop()
    loop.run_until_complete(main())

    finished_at = time.time()
    print("End at: " , finished_at)
    print("Time cost:" , finished_at - started_at)

    paicsv()
|