import time import urllib.request import sys import win32api import win32con from selenium import webdriver from bs4 import BeautifulSoup import xlwt
def _wait_for_login():
    """Block until the user confirms QR-code login with Y/y; exit on N/n.

    Any other answer re-prompts. Raises SystemExit when the user declines.
    """
    while True:
        answer = input('请扫描浏览器中的二维码,若已扫描登录输入"Y",若不想继续登录请输入“N”:')
        if answer in ('Y', 'y'):
            return
        if answer in ('N', 'n'):
            sys.exit()
        print("您的输入有误,请重新输入")


def _load_all_members(driver, member_count):
    """Scroll the member page to the bottom repeatedly so it lazy-loads everyone.

    The page loads roughly 21 members per scroll, hence one scroll per
    21-member chunk, with a short pause for the content to arrive.
    """
    for _ in range(1, ((member_count + 1) // 21) + 1):
        driver.execute_script("window.scrollTo(0,document.body.scrollHeight)")
        time.sleep(1)


def _extract_members(html):
    """Parse the rendered member page and return a list of member-field rows.

    Each row is the list of stripped strings from one <tr> of the member
    table (at most 7 fields, matching the spreadsheet columns).
    """
    soup = BeautifulSoup(html, "lxml")
    rows = []
    for body in soup.find_all('div', class_='body'):
        for dl in body.find_all('dl'):
            for dd in dl.find_all('dd'):
                # NOTE(review): 'group-memeber' appears to be the site's own
                # (misspelled) CSS class name — do not "fix" the spelling.
                for member_div in dd.find_all('div', class_='group-memeber'):
                    for table in member_div.find_all('table'):
                        for tbody in table.find_all('tbody', class_='list'):
                            for tr in tbody.find_all('tr'):
                                # Materialize once per row (the original rebuilt
                                # this list once per column) and truncate to the
                                # 7 columns the sheet has.
                                fields = list(tr.stripped_strings)[:7]
                                rows.append(fields)
    return rows


def _write_xls(book_name, headers, rows):
    """Write *headers* as row 0 and *rows* below it into an .xls workbook."""
    book = xlwt.Workbook(encoding='utf-8')
    sheet = book.add_sheet('sheet', cell_overwrite_ok=True)
    for col, title in enumerate(headers):
        sheet.write(0, col, title)
    for line, fields in enumerate(rows, start=1):
        for col, value in enumerate(fields):
            sheet.write(line, col, value)
    book.save(book_name)


def main():
    """Scrape a QQ group's member list into a local .xls spreadsheet.

    Prompts for the group number and member count, opens the group's
    member page in Firefox, waits for the user to log in via QR code,
    scrolls until every member is loaded, then saves the page source to
    '<group>.html' and the parsed member table to '<group>.xls' in the
    current directory.
    """
    # Spreadsheet column headers, in write order.
    headers = ['序号', '成员QQ昵称', '成员群昵称', '成员QQ号',
               '成员性别', '成员Q龄', '成员入群时间']
    prefix = 'https://qun.qq.com/member.html#gid='  # member-page URL prefix
    url = input('输入您所想爬取的QQ群号:')
    number = input('输入该群的人数:')

    driver = webdriver.Firefox()
    driver.get(prefix + url)
    _wait_for_login()
    _load_all_members(driver, int(number))

    # Keep a local copy of the rendered page, then parse the in-memory
    # source directly (the original re-read the file it had just written,
    # and leaked that file handle).
    content = driver.page_source
    with open(url + '.html', 'w', encoding='utf-8') as f:
        f.write(content)

    _write_xls(url + '.xls', headers, _extract_members(content))
    print('已将群成员数据保存在本地目录下!!!!!!!')


main()