BeautifulSoup
Naver Knowledge iN (지식iN) title pagination
import requests
from bs4 import BeautifulSoup
from fake_useragent import UserAgent

ua = UserAgent()
headers = {
    "User-Agent": ua.random   # random User-Agent so the requests look like a normal browser
}
for i in range(1, 10):
    # search results for 파이썬 (Python), one result page per iteration
    link = f'https://kin.naver.com/search/list.naver?query=%ED%8C%8C%EC%9D%B4%EC%8D%AC&page={i}'
    res = requests.get(link, headers=headers)
    bs = BeautifulSoup(res.text, 'html.parser')
    uls = bs.select_one('#s_content > div.section > ul')   # <ul> that holds the question list
    titles = uls.select('dt > a')
    for title in titles:
        print(title.text)
        print(title.attrs.get("href"))
Pymysql
- A library for connecting to a database from Python
Querying data with Pymysql
#!pip install pymysql
import pymysql
db = pymysql.connect(host="localhost", port=3306, user="root", password="{비밀번호}", db="market_db")
cursor = db.cursor()
sql = """
SELECT * FROM member;
"""
cursor.execute(sql)
result = cursor.fetchmany(size=100)
for data in result:
    print(data)
db.close()
Always remember to call db.close() (this matters even more in a Jupyter Notebook environment, where a connection can silently stay open between cell runs).
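A minimal sketch of one way to guarantee the close, using try/finally (same market_db/member setup as above):
import pymysql

db = pymysql.connect(host="localhost", port=3306, user="root", password="{비밀번호}", db="market_db")
try:
    cursor = db.cursor()
    cursor.execute("SELECT * FROM member;")
    for data in cursor.fetchmany(size=100):
        print(data)
finally:
    # runs even if the query raises, so the connection never stays open in the notebook
    db.close()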
Inserting data with Pymysql
import pymysql
db = pymysql.connect(host="localhost", port=3306, user="root", password="{비밀번호}", db="market_db")
cursor = db.cursor()
sql = """
INSERT INTO member VALUES('ABC', '에이비씨', 10, '경기', '031', '11122233', 170, '2023-08-01');
"""
cursor.execute(sql)
db.commit()   # INSERT/UPDATE/DELETE changes are only persisted after commit
db.close()
Creating a new schema and table in MySQL Workbench
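The schema and table can also be created from code instead of Workbench. Below is a rough sketch for the sba schema and link table used in the next exercise; the column names and types are assumptions inferred from the INSERT statement below, not the exact definition used in Workbench:
import pymysql

db = pymysql.connect(host="localhost", port=3306, user="root", password="{비밀번호}")
cursor = db.cursor()
cursor.execute("CREATE SCHEMA IF NOT EXISTS sba;")
# assumed columns: id, title, url, keyword, content, count, reg_date, created_at
cursor.execute("""
CREATE TABLE IF NOT EXISTS sba.link (
    id INT AUTO_INCREMENT PRIMARY KEY,
    title VARCHAR(500),
    url VARCHAR(1000),
    keyword VARCHAR(100),
    content TEXT,
    `count` INT,
    reg_date DATE,
    created_at DATETIME
);
""")
db.commit()
db.close()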
Exercise: search Naver News for 파이썬 (Python), crawl multiple pages, then insert into the database
# Search Naver News for 파이썬 (Python), crawl several result pages, then insert into the database
import requests
import pymysql
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
import time

ua = UserAgent()
headers = {
    "User-Agent": ua.random
}
db = pymysql.connect(host="localhost", port=3306, user="root", password="{비밀번호}", db="sba")
cursor = db.cursor()
for i in range(10):
    link = f"https://search.naver.com/search.naver?where=news&sm=tab_pge&query=%ED%8C%8C%EC%9D%B4%EC%8D%AC&sort=0&photo=0&field=0&pd=0&ds=&de=&cluster_rank=41&mynews=0&office_type=0&office_section_code=0&news_office_checked=&nso=so:r,p:all,a:all&start={i*10+1}"
    res = requests.get(link, headers=headers)
    bs = BeautifulSoup(res.text, 'html.parser')
    try:
        print(i)
        ul = bs.select_one('#main_pack > section > div > div.group_news > ul')
        divs = ul.select('div.news_wrap.api_ani_send')
        for div in divs:
            title = div.select_one('a.news_tit').text.replace("'", '"')
            link = div.select_one('a.news_tit').attrs.get("href").replace("'", '"')
            content = div.select_one('a.dsc_txt_wrap').text.replace("'", '"')
            count = content.count('파이썬')
            sql = f"""
            INSERT INTO link VALUES (NULL, '{title}', '{link}', '파이썬', '{content}', {count}, '2022-08-01', now())
            """
            cursor.execute(sql)
        time.sleep(1)
    except Exception as e:
        print("error", i, e)
db.commit()
db.close()
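The replace("'", '"') calls above exist only because single quotes would break the f-string SQL. A safer pattern is to pass the values separately and let pymysql do the quoting with %s placeholders; a minimal sketch with the same (assumed) link table and made-up example values:
import pymysql

db = pymysql.connect(host="localhost", port=3306, user="root", password="{비밀번호}", db="sba")
cursor = db.cursor()
# example values; in the crawl loop these would be the scraped title/link/content
title, link, content = "It's about Python", "https://example.com/a", "파이썬 is popular"
sql = "INSERT INTO link VALUES (NULL, %s, %s, %s, %s, %s, %s, now())"
# pymysql escapes each parameter, so quotes in the data cannot break the query
cursor.execute(sql, (title, link, '파이썬', content, content.count('파이썬'), '2022-08-01'))
db.commit()
db.close()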
## Fetch the page behind each link and store it in content
# count is also computed from the actual content before storing
# If you get an error that content is too long, slice it to content[:10000] before storing
# Crawling 1000 pages of Naver News results for the query 파이썬
import requests
import pymysql
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
import time

ua = UserAgent()
headers = {
    "User-Agent": ua.random
}
db = pymysql.connect(host="localhost", port=3306, user="root", password="{비밀번호}", db="sba")
cursor = db.cursor()
for i in range(10):
    url = f"https://search.naver.com/search.naver?where=news&sm=tab_pge&query=%ED%8C%8C%EC%9D%B4%EC%8D%AC&sort=0&photo=0&field=0&pd=0&ds=&de=&cluster_rank=41&mynews=0&office_type=0&office_section_code=0&news_office_checked=&nso=so:r,p:all,a:all&start={i*10+1}"
    res = requests.get(url, headers=headers)
    bs = BeautifulSoup(res.text, 'html.parser')
    try:
        print(i)
        ul = bs.select_one('#main_pack > section > div > div.group_news > ul')
        divs = ul.select('div.news_wrap.api_ani_send')
        for div in divs:
            title = div.select_one('a.news_tit').text.replace("'", '"')
            link = div.select_one('a.news_tit').attrs.get("href").replace("'", '"')
            headers = {"User-Agent": ua.random}
            # store the first 10,000 characters of each article page
            content = requests.get(link, headers=headers).text[:10000].replace("'", '"')
            count = content.count('파이썬')
            sql = f"""
            INSERT INTO link VALUES (NULL, '{title}', '{link}', '파이썬2', '{content}', {count}, '2022-08-01', now())
            """
            cursor.execute(sql)
        time.sleep(0.1)
    except Exception as e:
        print("error", i, e)
db.commit()
db.close()
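As written, content holds the first 10,000 characters of raw HTML, so count also matches occurrences inside tags and scripts. A sketch of an alternative (not what the original code does) that keeps only the visible text via get_text(); the URL is a placeholder for a crawled article link:
import requests
from bs4 import BeautifulSoup
from fake_useragent import UserAgent

ua = UserAgent()
link = "https://example.com/news/article"   # placeholder for a crawled article URL
res = requests.get(link, headers={"User-Agent": ua.random})
article = BeautifulSoup(res.text, 'html.parser')
# get_text() strips the markup, so the count reflects readable text only
content = article.get_text(separator=" ", strip=True)[:10000].replace("'", '"')
count = content.count('파이썬')
print(count)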
Openpyxl
Writing a value to a single cell
#pip install openpyxl
from openpyxl import Workbook
wb = Workbook()
ws = wb.active
ws["A1"] = "Test-Data"
wb.save("result.xlsx")
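The same cell can also be addressed by row/column index, which helps when the coordinates are computed; a small sketch:
from openpyxl import Workbook

wb = Workbook()
ws = wb.active
# cell(row=1, column=1) is the same cell as "A1"
ws.cell(row=1, column=1, value="Test-Data")
wb.save("result.xlsx")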
Writing values to multiple cells
#pip install openpyxl
from openpyxl import Workbook
wb = Workbook()
ws = wb.active
for row in range(10):
    ws.append([row, f"{row}-data"])
wb.save("result.xlsx")
Reading an Excel file
#pip install openpyxl
from openpyxl import load_workbook
wb = load_workbook("result.xlsx")
ws = wb.active
for row in ws.iter_rows():
    print(row[0].value, row[1].value)
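If only the values are needed rather than Cell objects, iter_rows can return plain tuples; a minimal sketch reading the same result.xlsx:
from openpyxl import load_workbook

wb = load_workbook("result.xlsx")
ws = wb.active
# values_only=True yields a tuple of cell values per row instead of Cell objects
for values in ws.iter_rows(values_only=True):
    print(values)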
Crawling Naver News and saving to Excel
## Fetch the page behind each link and store it in content
# count is also computed from the actual content before storing
# If you get an error that content is too long, slice it to content[:10000] before storing
# Crawling 1000 pages of Naver News results for the query 파이썬
import requests
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
import time
from openpyxl import Workbook

wb = Workbook()
ws = wb.active
ua = UserAgent()
headers = {
    "User-Agent": ua.random
}
ws.append(["row", "title", "link", "content", "count"])   # header row
for i in range(10):
    url = f"https://search.naver.com/search.naver?where=news&sm=tab_pge&query=%ED%8C%8C%EC%9D%B4%EC%8D%AC&sort=0&photo=0&field=0&pd=0&ds=&de=&cluster_rank=41&mynews=0&office_type=0&office_section_code=0&news_office_checked=&nso=so:r,p:all,a:all&start={i*10+1}"
    res = requests.get(url, headers=headers)
    bs = BeautifulSoup(res.text, 'html.parser')
    try:
        print(i)
        ul = bs.select_one('#main_pack > section > div > div.group_news > ul')
        divs = ul.select('div.news_wrap.api_ani_send')
        for div in divs:
            title = div.select_one('a.news_tit').text.replace("'", '"')
            link = div.select_one('a.news_tit').attrs.get("href").replace("'", '"')
            headers = {"User-Agent": ua.random}
            content = requests.get(link, headers=headers).text[:10000].replace("'", '"')
            count = content.count('파이썬')
            ws.append([i, title, link, content, count])
            break   # note: only the first article of each result page is kept
    except Exception as e:
        print("error", i, e)
wb.save("result.xlsx")
Exercise: Melon Chart Top 100
Insert the Melon Chart Top 100 into the DB and Excel for each hour
import requests
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
import pymysql
from openpyxl import Workbook

ua = UserAgent()
headers = {"User-Agent": ua.random}
# connect to the DB
db = pymysql.connect(host="localhost", port=3306, user="root", password="{비밀번호}", db="sba")
cursor = db.cursor()
# open an Excel workbook
wb = Workbook()
ws = wb.active
for time in range(17):
    url = f"https://www.melon.com/chart/index.htm?dayTime=20230801{str(time).zfill(2)}"
    res = requests.get(url, headers=headers)
    bs = BeautifulSoup(res.text, 'html.parser')
    tbody = bs.select("#frm > div > table > tbody > tr")
    for tr in tbody:
        rank = int(tr.select_one("span.rank").text.replace("'", '"'))
        title = tr.select_one("div.rank01 > span > a").text.replace("'", '"')
        singer = tr.select_one("div.rank02 > span > a").text.replace("'", '"')
        album = tr.select_one("div.rank03 > a").text.replace("'", '"')
        # like = tr.select_one("button > span.cnt")
        if tr.select_one("span.rank_wrap > span.none"):
            diff = "-"
        elif tr.select_one("span.rank_wrap > span.down"):
            diff = "+" + tr.select_one("span.rank_wrap > span.down").text
        elif tr.select_one("span.rank_wrap > span.up"):
            diff = "-" + tr.select_one("span.rank_wrap > span.up").text
        else:
            diff = ""   # fallback so diff is always defined (e.g. newly charted songs)
        sql = f"""
        INSERT INTO melon VALUES(NULL, '{rank}', '{title}', '{singer}', '{album}', 0, '{diff}', '{str(time).zfill(2)}')
        """
        cursor.execute(sql)
        ws.append([rank, title, singer, album, 0, diff, str(time).zfill(2)])
db.commit()
db.close()
wb.save("melon.xlsx")
For each singer, print the number of songs they have in the TOP 100 along with the singer name, sorted with the most songs first
## For each singer, print the number of songs in the TOP 100 along with the singer name, sorted with the most songs first
import pymysql

db = pymysql.connect(host="localhost", port=3306, user="root", password="{비밀번호}", db="sba")
cursor = db.cursor()
sql = """
SELECT time, singer, COUNT(title) as '곡 수'
FROM melon
GROUP BY time, singer
ORDER BY time, COUNT(title) desc;
"""
cursor.execute(sql)
result = cursor.fetchmany(size=100)
for data in result:
    print(data)
db.close()