bs4를 사용하여 html 셀렉터로 접근 가능
import requests
from bs4 import BeautifulSoup
웹 크롤링 할 때 그냥 보내면 네이버가 검색을 차단함
그래서 검색하는 컴퓨터가 사람이라고 알려야함
headers = {'User-Agent':'Mozilla/5.0', 'referer' : 'http://www.naver.com'}
웹 크롤링 쓰레드 사용
import threading
t = threading.Thread(target=clickMe)
t.start()
랜덤 시간 만큼 멈춤
from random import random
n = random()
time.sleep(n+0.4)
11/11 -> user-agent 를 수정해도 결국 차단됨
11/12 -> fake user-agent를 사용, 난수 사용 ( 아직 차단은 안 당했는데 테스트 더 해야함 )
head = {'User-Agent': str(ua.random), 'referer': 'http://www.naver.com'}
최종 -> fake user-agent 사용, referer 제거 ( 차단 안 당함)
head = {'User-Agent': str(ua.random)}
코드
import time

import requests
from bs4 import BeautifulSoup
from openpyxl import Workbook

# Crawl Naver web-document search results for `search` over the date range
# [start, end] (YYYYMMDD ints), `batch` days per iteration, and save the
# collected result URLs into column A of result.xlsx.
search = "취성패"
start = 20181108
end = 20181108
batch = 2
# Naver blocks obviously non-browser clients, so send a browser User-Agent.
headers = {'User-Agent': 'Mozilla/5.0', 'referer': 'http://www.naver.com'}

wb = Workbook()
ws = wb.active
ws['A1'] = '주소'
wsc = 2        # next Excel row index to write
urldata = []   # collected result URLs (deduplicated before saving)


def req(st, en, se, cc):
    """Fetch one page of Naver web-document search results.

    st/en: date range bounds (YYYYMMDD ints), se: query string,
    cc: 1-based result offset. Returns the parsed BeautifulSoup tree.
    """
    # NOTE: the original bound the HTTP response to a local named `req`,
    # shadowing this function; renamed to `resp`.
    resp = requests.get(
        "https://search.naver.com/search.naver?f=&fd=2&filetype=0"
        "&nso=so%3Ar%2Ca%3Aall%2Cp%3Afrom" + str(st) + "to" + str(en)
        + "&query=" + se + "&research_url=&sm=tab_pge&start=" + str(cc)
        + "&where=webkr",
        headers=headers,
    )
    return BeautifulSoup(resp.text, 'html.parser')


def getUrl(my_titles):
    """Append the text of every result-link tag to the global urldata list."""
    for title in my_titles:
        urldata.append(title.text)


def main():
    """Run the crawl loop and write the deduplicated URLs to result.xlsx."""
    global start, wsc, urldata

    while start <= end:
        print("날짜 :" + str(start) + " 웹 크롤링 중")
        currentpage = 1
        soup = req(start, end, search, currentpage)
        count = soup.find('span', class_='title_num')
        # The result counter looks like "1-10 / 123건"; take the total after
        # the slash. Guard against a missing counter element (blocked or
        # empty result page) which previously raised AttributeError on .text.
        if count is not None and '/' in count.text:
            c = int(count.text.split('/')[1].split('건')[0])
        else:
            c = 1
        # Naver paginates 10 results per page, hence the step of 10.
        while currentpage <= c:
            soup = req(start, end, search, currentpage)
            getUrl(soup.find_all('a', class_='txt_url'))
            currentpage = currentpage + 10
            time.sleep(0.5)  # throttle to avoid the rate limiter
        start = start + batch
        break  # NOTE: kept from the original — only the first batch is crawled

    # De-duplicate, then write one URL per row starting at A2.
    urldata = list(set(urldata))
    for data in urldata:
        ws['A' + str(wsc)] = data
        wsc = wsc + 1
    wb.save('result.xlsx')


if __name__ == "__main__":
    main()