本文共 9942 字,大约阅读时间需要 33 分钟。
这是我们课程实训的一个功能模块,用于爬取 Steam、Epic、Origin 三个平台的游戏价格信息。由于三个网站的页面构造不一样,加载数据的方式也不一样,所以我们需要采用不同的方法来爬取这三个平台的游戏数据。
用到的库:BeautifulSoup 包——提取所爬取网页中标签的属性值(游戏的价格信息等);selenium 的 webdriver——利用脚本实现动态加载数据;requests——用于爬取网页数据。
首页 URL:https://store.steampowered.com/search/?specials=1&page=1
steam网站的游戏数据是分页的,我们可以通过url拼接进行爬取
获取页面html信息
# Fetch the page HTML.
def getPage(pagenum, timeout=30):
    """Download one page of the Steam "specials" search results.

    Args:
        pagenum: Page number appended to the search URL's ``page=`` query
            parameter (converted with ``str``).
        timeout: Seconds passed to ``requests.get`` as its timeout. Without
            one, a stalled connection would block the crawl indefinitely.

    Returns:
        The response body as text, decoded as UTF-8.

    Raises:
        requests.exceptions.RequestException: if the request fails or times
            out (propagated from ``requests.get``).
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/51.0.2704.63 Safari/537.36'}
    # Example: https://store.steampowered.com/search/?specials=1&page=1
    url = "https://store.steampowered.com/search/?specials=1&page=" + str(pagenum)
    print(url)
    response = requests.get(url, headers=headers, timeout=timeout)
    # Force UTF-8 rather than relying on the encoding requests guesses.
    response.encoding = 'utf-8'
    return response.text
将html信息存入txt文件中
def saveHtmlCode(html, path):
    """Save downloaded HTML to a file.

    Keeping the raw page on disk makes testing easier and doubles as a
    backup in case the crawled page later changes or is updated.

    Args:
        html: Page content, either ``bytes`` or ``str``. A ``str`` is
            encoded as UTF-8 before writing; previously it raised
            ``TypeError`` because the file is opened in binary mode.
        path: Destination file path; an existing file is overwritten.
    """
    data = html.encode('utf-8') if isinstance(html, str) else html
    # The context manager closes the handle even if the write fails.
    with open(path, "wb") as file:
        file.write(data)
提取页面中的游戏数据
# Data cleaning happens here: redundant special characters (spaces, newlines,
# etc.) are removed from the scraped price text.
# Extract game information.
def getGameInfo1(html, game_list):
    """Parse a Steam search-results page and append discounted games.

    Rows that carry the plain (non-discounted) price cell are skipped, as are
    rows missing any node this function reads, or whose price text does not
    split into at least two parts.

    Args:
        html: HTML of one search-results page.
        game_list: List that receives one ``EpicGamePrice.Game`` per
            discounted row; mutated in place.

    Returns:
        None.
    """
    soup = BeautifulSoup(html, 'html.parser')
    # Container holding the list of games.
    games_Info = soup.find(id='search_resultsRows')
    if games_Info is None:
        # No results container on this page; there is nothing to extract.
        return

    for game_a in games_Info.find_all('a'):
        # This exact class string marks a row that is not discounted.
        if game_a.find('div', class_="col search_price responsive_secondrow") is not None:
            continue

        img_tag = game_a.find('img')
        title_tag = game_a.find('span', class_='title')
        discount_tag = game_a.find('div', class_="col search_discount responsive_secondrow")
        price_tag = game_a.find('div', class_="col search_price discounted responsive_secondrow")
        if (img_tag is None or not img_tag.has_attr('src') or title_tag is None
                or discount_tag is None or price_tag is None):
            # An incomplete row used to raise AttributeError/TypeError; skip it.
            continue

        # Image URL of the item.
        game_src = img_tag['src']
        # Item name.
        game_name = title_tag.get_text()
        # Discount label.
        game_discount = stripAndreplace(discount_tag.get_text())
        # Price text; `split` is a helper defined elsewhere in the project --
        # presumably it yields [original, final]; verify against the helper.
        priceText = split(stripAndreplace(price_tag.get_text()))
        if len(priceText) < 2:
            continue
        # Original price.
        game_original_price = priceText[0].replace(' ', '')
        # Discounted price.
        game_final_price = priceText[1].replace(' ', '')
        # Source platform.
        game_source = 'steam'
        game_now_time = datetime.datetime.now().strftime('%Y-%m-%d-%H:%M')
        # Argument order: name, original_price, final_price, discount, source, src, time.
        game = EpicGamePrice.Game(game_name, game_original_price, game_final_price,
                                  game_discount, game_source, game_src, game_now_time)
        print(game.printProperty())
        game_list.append(game)
将爬取的游戏信息存入csv文件中
def write_to_excel(game_list, path):
    """Write the scraped game records to a CSV file.

    Args:
        game_list: List of game-info objects. Each must expose ``get_name``,
            ``get_original_price``, ``get_final_price``, ``get_discount``,
            ``get_source``, ``get_src`` and ``get_now_date``.
        path: Destination path of the CSV file; an existing file is
            overwritten.
    """
    header = ["game_name", "game_original_price", "game_final_price", "game_discount",
              "game_source", 'game_src', 'game_now_date']
    # newline='' is what the csv module requires so that no blank rows appear
    # on Windows; the context manager closes the file even if a write raises.
    with open(path, 'w', encoding='utf-8-sig', newline='') as file:
        csv_writer = csv.writer(file)
        csv_writer.writerow(header)
        for game in game_list:
            # NOTE(review): the get_* names are read as attributes (not
            # called), exactly as before -- confirm Game defines them as
            # properties rather than plain methods.
            csv_writer.writerow([game.get_name, game.get_original_price, game.get_final_price,
                                 game.get_discount, game.get_source, game.get_src,
                                 game.get_now_date])
csv文件展示
我一共爬了 200 页数据。Epic 首页 URL:url = "https://www.epicgames.com/store/zh-CN/browse?sortBy=releaseDate&sortDir=DESC&pageSize=30"
因为epic数据是需要点击下面这个加载更多来显示更多游戏的
# The first step is a script that clicks the "load more" control, which is
# where the selenium webdriver comes in.
# 1. Click the "load more" button so more data is loaded, then grab the page source.
# Open the browser.
def openFireFoxDiver():
    """Open the Epic store browse page in Firefox and return its HTML.

    Loads the page, clicks the element with id ``browse-pagination`` once,
    waits for the extra data, and captures the page source.

    Returns:
        The page source encoded as UTF-8 ``bytes``.
    """
    url = "https://www.epicgames.com/store/zh-CN/browse?sortBy=releaseDate&sortDir=DESC&pageSize=30"
    driver = webdriver.Firefox()
    try:
        driver.get(url)
        # Pause before looking up the pagination control.
        time.sleep(10)
        # 'id' is the locator strategy string (the value of By.ID); this call
        # form works on Selenium 3 and 4, whereas find_element_by_id was
        # removed in Selenium 4.3.
        button = driver.find_element('id', 'browse-pagination')
        button.click()
        # Wait 15 seconds so the page can finish loading its data.
        time.sleep(15)
        # Grab the page source.
        html_page = driver.page_source.encode('utf-8')
        return html_page
    finally:
        # Always shut the browser down; it was previously left running.
        driver.quit()
获取页面游戏信息
# Extract game information from the saved page.
def getGameInfo(game_list, path, exchange_rate=6.53):
    """Parse a saved Epic store page and append one ``Game`` per card.

    Cards without an ``<img>``, or whose image lacks the ``data-image`` or
    ``alt`` attribute, are skipped.

    Args:
        game_list: List that receives the parsed ``Game`` objects; mutated
            in place.
        path: Location of the saved HTML, passed to ``getHtmlCode``.
        exchange_rate: Multiplier applied to both scraped prices. Defaults
            to 6.53, the value that used to be hard-coded (presumably a
            USD-to-CNY rate -- confirm).

    Returns:
        None.
    """
    html = getHtmlCode(path)
    html_page_soup = BeautifulSoup(html, 'html.parser')
    gameInfo = html_page_soup.find_all('li', class_='css-1adx3p4-BrowseGrid-styles__card')
    print(len(gameInfo))

    for card in gameInfo:
        # Game image; its attributes carry the image link and the game name.
        game_img = card.find('img')
        if (game_img is None or not game_img.has_attr('data-image')
                or not game_img.has_attr('alt')):
            # A card without an <img> used to raise AttributeError; skip it.
            continue
        game_src = game_img['data-image']
        game_name = game_img['alt']

        # Discount label.
        discount_tag = card.find('span', attrs={'data-component': 'DiscountAmount'})
        game_discount = 'no discount' if discount_tag is None else discount_tag.get_text()
        print(game_discount)

        # Current price; '0' when no price element is present.
        final_tag = card.find('span', attrs={'data-component': 'Price'})
        game_final_price = '0' if final_tag is None else final_tag.get_text()

        # Original price; falls back to the current price when there is no
        # struck-through <s> element.
        original_tag = card.find('s', attrs={'data-component': 'Price'})
        game_original_price = (game_final_price if original_tag is None
                               else original_tag.get_text())

        game_final_price = stripAndreplace(game_final_price)
        game_original_price = stripAndreplace(game_original_price)
        # NOTE(review): float() raises ValueError if the cleaned text is not
        # numeric -- confirm what stripAndreplace returns for such labels.
        game_final_price = str(round(float(game_final_price) * exchange_rate, 2))
        game_original_price = str(round(float(game_original_price) * exchange_rate, 2))
        print("op = " + str(game_original_price) + "  fp = " + str(game_final_price))

        game_source = 'epic'
        game_now_time = datetime.datetime.now().strftime('%Y-%m-%d-%H:%M')
        game = Game(game_name, game_original_price, game_final_price, game_discount,
                    game_source, game_src, game_now_time)
        print(game.printProperty())
        game_list.append(game)
写入csv文件
def write_to_excel(game_list, path):
    """Write the scraped game records to a CSV file.

    Args:
        game_list: List of game-info objects. Each must expose ``get_name``,
            ``get_original_price``, ``get_final_price``, ``get_discount``,
            ``get_source``, ``get_src`` and ``get_now_date``.
        path: Destination path of the CSV file; an existing file is
            overwritten.
    """
    header = ["game_name", "game_original_price", "game_final_price", "game_discount",
              "game_source", 'game_src', 'game_now_date']
    # newline='' is what the csv module requires so that no blank rows appear
    # on Windows; the context manager closes the file even if a write raises.
    with open(path, 'w', encoding='utf-8-sig', newline='') as file:
        csv_writer = csv.writer(file)
        csv_writer.writerow(header)
        for game in game_list:
            # NOTE(review): the get_* names are read as attributes (not
            # called), exactly as before -- confirm Game defines them as
            # properties rather than plain methods.
            csv_writer.writerow([game.get_name, game.get_original_price, game.get_final_price,
                                 game.get_discount, game.get_source, game.get_src,
                                 game.get_now_date])
爬取页面展示
def write_to_excel(game_list, path):
    """Write the scraped game records to a CSV file.

    Args:
        game_list: List of game-info objects. Each must expose ``get_name``,
            ``get_original_price``, ``get_final_price``, ``get_discount``,
            ``get_source``, ``get_src`` and ``get_now_date``.
        path: Destination path of the CSV file; an existing file is
            overwritten.
    """
    header = ["game_name", "game_original_price", "game_final_price", "game_discount",
              "game_source", 'game_src', 'game_now_date']
    # newline='' is what the csv module requires so that no blank rows appear
    # on Windows; the context manager closes the file even if a write raises.
    with open(path, 'w', encoding='utf-8-sig', newline='') as file:
        csv_writer = csv.writer(file)
        csv_writer.writerow(header)
        for game in game_list:
            # NOTE(review): the get_* names are read as attributes (not
            # called), exactly as before -- confirm Game defines them as
            # properties rather than plain methods.
            csv_writer.writerow([game.get_name, game.get_original_price, game.get_final_price,
                                 game.get_discount, game.get_source, game.get_src,
                                 game.get_now_date])
首页 URL:url = "https://www.origin.com/hkg/en-us/store/deals/holidaysale"
Origin 平台的游戏数据是通过下拉滚动条来进行动态加载的,因此我们需要写一个脚本,定时将滚动条移到页面最底端。下面的代码将这个操作重复 30 次,这已经足够了,因为 Origin 平台就这么些游戏。
# Script that scrolls the page down automatically and then grabs the page source.
# Open the browser.
def openFireFoxDiver():
    """Open the Origin sale page in Firefox, scroll to load it, return its HTML.

    The page is scrolled to the bottom 30 times with a 5-second pause after
    each scroll, then the page source is captured.

    Returns:
        The page source encoded as UTF-8 ``bytes``.
    """
    # Alternative listing URL kept from the original for reference:
    # https://www.origin.com/hkg/en-us/store/browse?fq=platform:pc-download
    url = "https://www.origin.com/hkg/en-us/store/deals/holidaysale"
    driver = webdriver.Firefox()
    try:
        driver.get(url)
        time.sleep(15)
        for i in range(0, 30):
            # Jump to the bottom of the document, then pause before the next scroll.
            driver.execute_script('window.scrollTo(0, document.body.scrollHeight)')
            time.sleep(5)
            print(i)
        # Grab the page source.
        html_page = driver.page_source.encode('utf-8')
        return html_page
    finally:
        # Always shut the browser down; it was previously left running.
        driver.quit()
获取页面游戏信息
# Extract game information from the saved page.
def getGameInfo(game_list, path):
    """Parse a saved Origin sale page and append one ``Game`` per offer.

    Offers missing the image (or its ``src``) or the title element are
    skipped.

    Args:
        game_list: List that receives the parsed ``EpicGamePrice.Game``
            objects; mutated in place.
        path: Location of the saved HTML, passed to
            ``EpicGamePrice.getHtmlCode``.

    Returns:
        None.
    """
    html = EpicGamePrice.getHtmlCode(path)
    html_page_soup = BeautifulSoup(html, 'html.parser')
    gameInfo = html_page_soup.find_all('origin-store-bundle-offer')

    for offer in gameInfo:
        img_tag = offer.find('img')
        title_tag = offer.find('h2', attrs={'class': 'otktitle-4 origin-storebundleoffer-title'})
        if img_tag is None or not img_tag.has_attr('src') or title_tag is None:
            # A missing node used to raise TypeError/AttributeError; skip the offer.
            continue
        # Image URL of the game.
        game_src = img_tag['src']
        # Game name.
        game_name = title_tag.get_text()

        # NOTE(review): the source this was recovered from had lost its
        # indentation, so it is ambiguous whether stripAndreplace /
        # dealSaveUpto ran only on scraped text or also on the fallback
        # values below. They are applied to scraped text only -- confirm.

        # Current price; '-1' marks a missing price element.
        price_tag = offer.find('p', attrs={'class': 'origin-store-offerprice-price otkprice'})
        if price_tag is None:
            game_final_price = '-1'
        else:
            game_final_price = stripAndreplace(price_tag.get_text())

        # Discount label; looked up once instead of twice.
        sale_tag = offer.find('span', attrs={'class': 'otkprice-sale'})
        if sale_tag is None:
            # Spelling kept verbatim: other code may compare against this value.
            game_discount = 'Internert not connect'
        else:
            game_discount = dealSaveUpto(sale_tag.get_text())

        # Original price and final price are both produced by the helper.
        [game_original_price, game_final_price] = getOriginPriceAndFinalPrice(
            game_final_price, game_discount)

        # Source platform.
        game_source = 'origin'
        game_now_time = datetime.datetime.now().strftime('%Y-%m-%d-%H:%M')
        game = EpicGamePrice.Game(game_name, game_original_price, game_final_price,
                                  game_discount, game_source, game_src, game_now_time)
        print(game.printProperty())
        game_list.append(game)
将数据写入csv文件中
def write_to_excel(game_list, path):
    """Write the scraped game records to a CSV file.

    Args:
        game_list: List of game-info objects. Each must expose ``get_name``,
            ``get_original_price``, ``get_final_price``, ``get_discount``,
            ``get_source``, ``get_src`` and ``get_now_date``.
        path: Destination path of the CSV file; an existing file is
            overwritten.
    """
    header = ["game_name", "game_original_price", "game_final_price", "game_discount",
              "game_source", 'game_src', 'game_now_date']
    # newline='' is what the csv module requires so that no blank rows appear
    # on Windows; the context manager closes the file even if a write raises.
    with open(path, 'w', encoding='utf-8-sig', newline='') as file:
        csv_writer = csv.writer(file)
        csv_writer.writerow(header)
        for game in game_list:
            # NOTE(review): the get_* names are read as attributes (not
            # called), exactly as before -- confirm Game defines them as
            # properties rather than plain methods.
            csv_writer.writerow([game.get_name, game.get_original_price, game.get_final_price,
                                 game.get_discount, game.get_source, game.get_src,
                                 game.get_now_date])
爬取效果展示
转载地址:http://yjaen.baihongyu.com/