Using the 공공데이터포털 (public data portal) API
- Go to https://www.data.go.kr/
- Sign up for an account
- Apply for data use (https://www.data.go.kr/data/15059468/openapi.do)
- Get an OpenAPI service key: 마이페이지 > 데이터활용 > Open API > 인증키 발급 현황 (My Page > Data Use > Open API > issued key status)
Using the 기상청 (Korea Meteorological Administration) data API
import requests

url = "http://apis.data.go.kr/1360000/MidFcstInfoService/getMidFcst"
params = {
    'serviceKey': '{개인인증키}',   # personal service key issued by data.go.kr
    'numOfRows': '10',
    'pageNo': '1',
    'dataType': 'JSON',
    'stnId': '108',                 # station/region code
    'tmFc': '202308020600'          # forecast issue time (YYYYMMDDHHMM)
}

response = requests.get(url, params=params)
data = response.json()
print(data)
When using the personal service key: use the Decoding key, not the Encoding key (requests URL-encodes query parameters itself, so an already-encoded key would be encoded twice).
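If only the Encoding key is at hand, it can be decoded before being passed to requests. A minimal sketch; the placeholder string below is hypothetical:

from urllib.parse import unquote

# Hypothetical placeholder: paste the Encoding key copied from data.go.kr here.
encoded_key = '{개인인증키(Encoding)}'
# unquote() turns %2B, %3D, ... back into +, =, ... so that requests
# can safely re-encode the key when it builds the query string.
params['serviceKey'] = unquote(encoded_key)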
Printing JSON in a readable way: pprint (pretty-print)
import pprint

pprint.pprint(data)
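Once the structure is visible, individual fields can be pulled out by indexing into the nested dictionaries. A sketch assuming the usual data.go.kr nesting of response > body > items > item; the 'wfSv' field name is an assumption and should be checked against the pprint output:

# Assumed nesting; verify against the pprint output above.
items = data['response']['body']['items']['item']
for item in items:
    print(item.get('wfSv'))   # 'wfSv' (forecast overview text) is an assumption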
Selenium
- A library for dynamic crawling (pages that render their content with JavaScript)
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
import requests
import time

release = 'https://chromedriver.storage.googleapis.com/LATEST_RELEASE'
version = requests.get(release).text
service = Service(ChromeDriverManager(version=version).install())
d = webdriver.Chrome(service=service)

try:
    d.get("https://naver.com")
except Exception as e:
    print(e)
finally:
    time.sleep(2)
    d.close()
    d.quit()
The ChromeDriver installed by webdriver_manager had not been updated to match the latest Chrome version, which caused an error; fetching the latest release number and passing it in as version solved it.
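As a side note, recent Selenium releases (4.6 and later) ship with Selenium Manager, which downloads a matching driver automatically, so the webdriver_manager workaround above may not be needed at all. A minimal sketch:

from selenium import webdriver

# Selenium 4.6+ resolves a matching ChromeDriver on its own (Selenium Manager).
driver = webdriver.Chrome()
driver.get("https://naver.com")
driver.quit()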
Dynamically crawling Naver news articles
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
import requests
import time
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

release = 'https://chromedriver.storage.googleapis.com/LATEST_RELEASE'
version = requests.get(release).text
service = Service(ChromeDriverManager(version=version).install())
d = webdriver.Chrome(service=service)

try:
    d.get("https://news.naver.com/main/main.naver?mode=LSD&mid=shm&sid1=105")
    section = WebDriverWait(d, 10).until(EC.presence_of_element_located((By.CSS_SELECTOR, ".section_body")))
    lis = section.find_elements(By.CSS_SELECTOR, "ul > li")
    for li in lis:
        print(li.text)
    for i in range(2, 6):
        print(f"================================{i}========================================")
        page_area = WebDriverWait(d, 10).until(EC.presence_of_element_located((By.CSS_SELECTOR, "#paging")))
        page_2 = page_area.find_element(By.LINK_TEXT, f"{i}")
        page_2.click()
        time.sleep(0.5)
        section = WebDriverWait(d, 10).until(EC.presence_of_element_located((By.CSS_SELECTOR, ".section_body")))
        lis = section.find_elements(By.CSS_SELECTOR, "ul > li")
        for li in lis:
            print(li.text)
except Exception as e:
    print(e)
finally:
    time.sleep(2)
    d.close()
    d.quit()
I tried using `WebDriverWait` with `EC.presence_of_element_located` to drop `time.sleep()` and minimize waiting, but got a 'Message: stale element reference: stale element not found' error. Solution: just as `BeautifulSoup` has `select` and `Selenium` has `find_elements`, use `EC.presence_of_all_elements_located` to wait for the whole list instead of a single element.
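Applied to the news list above, the waiting line would look roughly like this (same selectors as before):

# Wait for all list items at once instead of a single container element.
lis = WebDriverWait(d, 10).until(
    EC.presence_of_all_elements_located((By.CSS_SELECTOR, ".section_body ul > li"))
)
for li in lis:
    print(li.text)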
Melon Chart Top 100
from fake_useragent import UserAgent
import pymysql
from openpyxl import Workbook
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
import requests
import time
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Connect to MySQL
db = pymysql.connect(host="localhost", port=3306, user="root", password="{비밀번호}", db="sba")
cursor = db.cursor()

# Create an Excel workbook
wb = Workbook()
ws = wb.active

# Start the webdriver
release = 'https://chromedriver.storage.googleapis.com/LATEST_RELEASE'
version = requests.get(release).text
service = Service(ChromeDriverManager(version=version).install())
driver = webdriver.Chrome(service=service)

try:
    url = "https://www.melon.com/chart/index.htm"
    driver.get(url)
    tbody = driver.find_elements(By.CSS_SELECTOR, "#frm > div > table > tbody > tr")
    for tr in tbody:
        # Single quotes are replaced so they do not break the f-string SQL below
        rank = int(tr.find_element(By.CSS_SELECTOR, "span.rank").text.replace("'", '"'))
        title = tr.find_element(By.CSS_SELECTOR, "div.rank01 > span > a").text.replace("'", '"')
        singer = tr.find_element(By.CSS_SELECTOR, "div.ellipsis.rank02 > a").text.replace("'", '"')
        album = tr.find_element(By.CSS_SELECTOR, "div.rank03 > a").text.replace("'", '"')
        like = int(tr.find_element(By.CSS_SELECTOR, "button > span.cnt").text.replace(",", '').strip())
        # Rank change: "new" by default, "-" for no change, "+n"/"-n" for up/down
        diff = "new"
        try:
            tr.find_element(By.CSS_SELECTOR, "span.rank_wrap > span.none")
            diff = "-"
        except: pass
        try:
            tr.find_element(By.CSS_SELECTOR, "span.rank_wrap > span.down")
            diff = "-" + tr.find_element(By.CSS_SELECTOR, "span.rank_wrap > span.down").text
        except: pass
        try:
            tr.find_element(By.CSS_SELECTOR, "span.rank_wrap > span.up")
            diff = "+" + tr.find_element(By.CSS_SELECTOR, "span.rank_wrap > span.up").text
        except: pass
        sql = f"""
        INSERT INTO melon VALUES(NULL, '{rank}', '{title}', '{singer}', '{album}', '{like}', '{diff}', '00')
        """
        cursor.execute(sql)
        ws.append([rank, title, singer, album, like, diff, '00'])
except Exception as e:
    print(e)
finally:
    time.sleep(2)
    db.commit()
    db.close()
    driver.close()
    driver.quit()
    wb.save("melon.xlsx")
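Replacing single quotes by hand works, but pymysql can also bind the values itself with a parameterized query, which avoids quoting problems entirely. A sketch assuming the same melon table layout as the INSERT above:

# %s placeholders let pymysql escape quotes, commas, etc. safely.
sql = "INSERT INTO melon VALUES(NULL, %s, %s, %s, %s, %s, %s, %s)"
cursor.execute(sql, (rank, title, singer, album, like, diff, '00'))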
중고나라 (Joonggonara cafe)
- The search results are loaded dynamically inside an iframe, so the page URL does not change
- Move into the frame with `switch_to.frame()`, and return to the original document with `switch_to.default_content()`
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
import requests
import time
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys

release = 'https://chromedriver.storage.googleapis.com/LATEST_RELEASE'
version = requests.get(release).text
service = Service(ChromeDriverManager(version=version).install())
d = webdriver.Chrome(service=service)

try:
    d.get("https://cafe.naver.com/joonggonara")
    elem = d.find_element(By.CSS_SELECTOR, "#topLayerQueryInput")
    elem.send_keys("자전거")
    elem.send_keys(Keys.RETURN)
    iframe = d.find_element(By.CSS_SELECTOR, "#cafe_main")
    d.switch_to.frame(iframe)
    article = d.find_element(By.CSS_SELECTOR, ".article")
    print(article.text)
except Exception as e:
    print(e)
finally:
    time.sleep(2)
    d.close()
    d.quit()
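If the iframe has not finished loading when the search results appear, the frame lookup itself can be wrapped in an explicit wait; Selenium's expected_conditions include a condition that waits for a frame and switches into it in one step. A minimal sketch with the same #cafe_main selector:

# Wait until the iframe exists, then switch into it automatically.
WebDriverWait(d, 10).until(
    EC.frame_to_be_available_and_switch_to_it((By.CSS_SELECTOR, "#cafe_main"))
)
article = d.find_element(By.CSS_SELECTOR, ".article")
print(article.text)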
중고나라: searching several keywords, crawling pages 1-5, and writing each keyword's results to its own Excel sheet
# Search Joonggonara for each keyword and write title, writer, and date
# for pages 1-5 to one Excel sheet per keyword
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
import requests
import time
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
from openpyxl import Workbook

# Wrap the crawl in a function that takes any number of keywords via *args
def crawl(*args):
    # Excel workbook
    wb = Workbook()
    # Start the webdriver
    release = 'https://chromedriver.storage.googleapis.com/LATEST_RELEASE'
    version = requests.get(release).text
    service = Service(ChromeDriverManager(version=version).install())
    d = webdriver.Chrome(service=service)
    try:
        for arg in args:
            ws = wb.create_sheet(arg)  # one sheet per keyword
            ws.append(['title', 'writer', 'date'])
            # Open Joonggonara and type the keyword into the search box
            d.get("https://cafe.naver.com/joonggonara")
            elem = WebDriverWait(d, 10).until(EC.presence_of_element_located((By.CSS_SELECTOR, "#topLayerQueryInput")))
            elem.send_keys(arg)
            elem.send_keys(Keys.RETURN)
            # The result list lives inside an iframe
            iframe = WebDriverWait(d, 10).until(EC.presence_of_element_located((By.CSS_SELECTOR, "#cafe_main")))
            d.switch_to.frame(iframe)
            # Iterate over pages 1-5: scrape the current page, then click the next one
            for i in range(2, 7):
                trs = d.find_elements(By.CSS_SELECTOR, '#main-area > div:nth-child(5) > table > tbody > tr')
                # Title, writer, date for each row
                for tr in trs:
                    title = tr.find_element(By.CSS_SELECTOR, 'td.td_article > div.board-list > div > a').text
                    writer = tr.find_element(By.CSS_SELECTOR, 'td.td_name > div > table > tbody > tr > td > a').text
                    date = tr.find_element(By.CSS_SELECTOR, 'td.td_date').text
                    # Append a row to the sheet
                    ws.append([title, writer, date])
                # After page 5 there is nothing left to click
                if i == 6:
                    break
                # Move to the next page
                page_area = WebDriverWait(d, 10).until(EC.presence_of_element_located((By.CSS_SELECTOR, "#main-area > div.prev-next")))
                go_to_page = page_area.find_element(By.LINK_TEXT, f"{i}")
                go_to_page.click()
            # Return to the top-level document before the next search
            d.switch_to.default_content()
    except Exception as e:
        print(e)
    finally:
        # Shut down the webdriver
        d.close()
        d.quit()
        # Save the workbook
        wb.save("joonggonara.xlsx")

# Run with several keywords
crawl("자전거", "노트북", "키보드", "모니터")
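One small leftover: Workbook() starts with a default sheet named "Sheet" that never receives any rows here, so the saved file carries one empty sheet alongside the keyword sheets. If that matters, openpyxl can remove it before saving:

# Drop the empty default sheet created by Workbook(), then save.
wb.remove(wb["Sheet"])
wb.save("joonggonara.xlsx")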