BeautifulSoup
Naver Knowledge iN (지식iN) title pagination
import requests
from bs4 import BeautifulSoup
from fake_useragent import UserAgent

ua = UserAgent()
headers = {
    "User-Agent": ua.random   # random User-Agent so the requests look like a normal browser
}
for i in range(1, 10):
    # search results for 파이썬 (Python), one result page per iteration
    link = f'https://kin.naver.com/search/list.naver?query=%ED%8C%8C%EC%9D%B4%EC%8D%AC&page={i}'
    res = requests.get(link, headers=headers)
    bs = BeautifulSoup(res.text, 'html.parser')
    uls = bs.select_one('#s_content > div.section > ul')   # <ul> that holds the question list
    titles = uls.select('dt > a')
    for title in titles:
        print(title.text)
        print(title.attrs.get("href"))
Pymysql
- A library for connecting to a database from Python
Querying data with Pymysql
#!pip install pymysql
import pymysql
db = pymysql.connect(host="localhost", port=3306, user="root", password="{비밀번호}", db="market_db")
cursor = db.cursor()
sql = """
SELECT * FROM member;
"""
cursor.execute(sql)
result = cursor.fetchmany(size=100)
for data in result:
    print(data)
db.close()
Always remember to call db.close() (this matters even more in a Jupyter Notebook environment, where a connection can silently stay open between cell runs).
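A minimal sketch of one way to guarantee the close, using try/finally (same market_db/member setup as above):
import pymysql

db = pymysql.connect(host="localhost", port=3306, user="root", password="{비밀번호}", db="market_db")
try:
    cursor = db.cursor()
    cursor.execute("SELECT * FROM member;")
    for data in cursor.fetchmany(size=100):
        print(data)
finally:
    # runs even if the query raises, so the connection never stays open in the notebook
    db.close()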
Inserting data with Pymysql
import pymysql
db = pymysql.connect(host="localhost", port=3306, user="root", password="{비밀번호}", db="market_db")
cursor = db.cursor()
sql = """
INSERT INTO member VALUES('ABC', '에이비씨', 10, '경기', '031', '11122233', 170, '2023-08-01');
"""
cursor.execute(sql)
db.commit()   # INSERT/UPDATE/DELETE changes are only persisted after commit
db.close()
Creating a new schema and table in MySQL Workbench
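The schema and table can also be created from code instead of Workbench. Below is a rough sketch for the sba schema and link table used in the next exercise; the column names and types are assumptions inferred from the INSERT statement below, not the exact definition used in Workbench:
import pymysql

db = pymysql.connect(host="localhost", port=3306, user="root", password="{비밀번호}")
cursor = db.cursor()
cursor.execute("CREATE SCHEMA IF NOT EXISTS sba;")
# assumed columns: id, title, url, keyword, content, count, reg_date, created_at
cursor.execute("""
CREATE TABLE IF NOT EXISTS sba.link (
    id INT AUTO_INCREMENT PRIMARY KEY,
    title VARCHAR(500),
    url VARCHAR(1000),
    keyword VARCHAR(100),
    content TEXT,
    `count` INT,
    reg_date DATE,
    created_at DATETIME
);
""")
db.commit()
db.close()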
Exercise: search Naver News for 파이썬 (Python), crawl multiple pages, then insert into the database
# Search Naver News for 파이썬 (Python), crawl several result pages, then insert into the database
import requests
import pymysql
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
import time

ua = UserAgent()
headers = {
    "User-Agent": ua.random
}
db = pymysql.connect(host="localhost", port=3306, user="root", password="{비밀번호}", db="sba")
cursor = db.cursor()
for i in range(10):
    link = f"https://search.naver.com/search.naver?where=news&sm=tab_pge&query=%ED%8C%8C%EC%9D%B4%EC%8D%AC&sort=0&photo=0&field=0&pd=0&ds=&de=&cluster_rank=41&mynews=0&office_type=0&office_section_code=0&news_office_checked=&nso=so:r,p:all,a:all&start={i*10+1}"
    res = requests.get(link, headers=headers)
    bs = BeautifulSoup(res.text, 'html.parser')
    try:
        print(i)
        ul = bs.select_one('#main_pack > section > div > div.group_news > ul')
        divs = ul.select('div.news_wrap.api_ani_send')
        for div in divs:
            title = div.select_one('a.news_tit').text.replace("'", '"')
            link = div.select_one('a.news_tit').attrs.get("href").replace("'", '"')
            content = div.select_one('a.dsc_txt_wrap').text.replace("'", '"')
            count = content.count('파이썬')
            sql = f"""
            INSERT INTO link VALUES (NULL, '{title}', '{link}', '파이썬', '{content}', {count}, '2022-08-01', now())
            """
            cursor.execute(sql)
        time.sleep(1)
    except Exception as e:
        print("error", i, e)
db.commit()
db.close()
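The replace("'", '"') calls above exist only because single quotes would break the f-string SQL. A safer pattern is to pass the values separately and let pymysql do the quoting with %s placeholders; a minimal sketch with the same (assumed) link table and made-up example values:
import pymysql

db = pymysql.connect(host="localhost", port=3306, user="root", password="{비밀번호}", db="sba")
cursor = db.cursor()
# example values; in the crawl loop these would be the scraped title/link/content
title, link, content = "It's about Python", "https://example.com/a", "파이썬 is popular"
sql = "INSERT INTO link VALUES (NULL, %s, %s, %s, %s, %s, %s, now())"
# pymysql escapes each parameter, so quotes in the data cannot break the query
cursor.execute(sql, (title, link, '파이썬', content, content.count('파이썬'), '2022-08-01'))
db.commit()
db.close()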
## Fetch the page behind each link and store it in content
# count is also computed from the actual content before storing
# If you get an error that content is too long, slice it to content[:10000] before storing
# Crawling 1000 pages of Naver News results for the query 파이썬
import requests
import pymysql
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
import time

ua = UserAgent()
headers = {
    "User-Agent": ua.random
}
db = pymysql.connect(host="localhost", port=3306, user="root", password="{비밀번호}", db="sba")
cursor = db.cursor()
for i in range(10):
    url = f"https://search.naver.com/search.naver?where=news&sm=tab_pge&query=%ED%8C%8C%EC%9D%B4%EC%8D%AC&sort=0&photo=0&field=0&pd=0&ds=&de=&cluster_rank=41&mynews=0&office_type=0&office_section_code=0&news_office_checked=&nso=so:r,p:all,a:all&start={i*10+1}"
    res = requests.get(url, headers=headers)
    bs = BeautifulSoup(res.text, 'html.parser')
    try:
        print(i)
        ul = bs.select_one('#main_pack > section > div > div.group_news > ul')
        divs = ul.select('div.news_wrap.api_ani_send')
        for div in divs:
            title = div.select_one('a.news_tit').text.replace("'", '"')
            link = div.select_one('a.news_tit').attrs.get("href").replace("'", '"')
            headers = {"User-Agent": ua.random}
            # store the first 10,000 characters of each article page
            content = requests.get(link, headers=headers).text[:10000].replace("'", '"')
            count = content.count('파이썬')
            sql = f"""
            INSERT INTO link VALUES (NULL, '{title}', '{link}', '파이썬2', '{content}', {count}, '2022-08-01', now())
            """
            cursor.execute(sql)
        time.sleep(0.1)
    except Exception as e:
        print("error", i, e)
db.commit()
db.close()
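As written, content holds the first 10,000 characters of raw HTML, so count also matches occurrences inside tags and scripts. A sketch of an alternative (not what the original code does) that keeps only the visible text via get_text(); the URL is a placeholder for a crawled article link:
import requests
from bs4 import BeautifulSoup
from fake_useragent import UserAgent

ua = UserAgent()
link = "https://example.com/news/article"   # placeholder for a crawled article URL
res = requests.get(link, headers={"User-Agent": ua.random})
article = BeautifulSoup(res.text, 'html.parser')
# get_text() strips the markup, so the count reflects readable text only
content = article.get_text(separator=" ", strip=True)[:10000].replace("'", '"')
count = content.count('파이썬')
print(count)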
Openpyxl
Writing a value to a single cell
#pip install openpyxl
from openpyxl import Workbook
wb = Workbook()
ws = wb.active
ws["A1"] = "Test-Data"
wb.save("result.xlsx")
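The same cell can also be addressed by row/column index, which helps when the coordinates are computed; a small sketch:
from openpyxl import Workbook

wb = Workbook()
ws = wb.active
# cell(row=1, column=1) is the same cell as "A1"
ws.cell(row=1, column=1, value="Test-Data")
wb.save("result.xlsx")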
Writing values to multiple cells
#pip install openpyxl
from openpyxl import Workbook
wb = Workbook()
ws = wb.active
for row in range(10):
    ws.append([row, f"{row}-data"])
wb.save("result.xlsx")
Reading an Excel file
#pip install openpyxl
from openpyxl import load_workbook
wb = load_workbook("result.xlsx")
ws = wb.active
for row in ws.iter_rows():
    print(row[0].value, row[1].value)
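If only the values are needed rather than Cell objects, iter_rows can return plain tuples; a minimal sketch reading the same result.xlsx:
from openpyxl import load_workbook

wb = load_workbook("result.xlsx")
ws = wb.active
# values_only=True yields a tuple of cell values per row instead of Cell objects
for values in ws.iter_rows(values_only=True):
    print(values)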
Crawling Naver News and saving to Excel
## Fetch the page behind each link and store it in content
# count is also computed from the actual content before storing
# If you get an error that content is too long, slice it to content[:10000] before storing
# Crawling 1000 pages of Naver News results for the query 파이썬
import requests
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
import time
from openpyxl import Workbook

wb = Workbook()
ws = wb.active
ua = UserAgent()
headers = {
    "User-Agent": ua.random
}
ws.append(["row", "title", "link", "content", "count"])   # header row
for i in range(10):
    url = f"https://search.naver.com/search.naver?where=news&sm=tab_pge&query=%ED%8C%8C%EC%9D%B4%EC%8D%AC&sort=0&photo=0&field=0&pd=0&ds=&de=&cluster_rank=41&mynews=0&office_type=0&office_section_code=0&news_office_checked=&nso=so:r,p:all,a:all&start={i*10+1}"
    res = requests.get(url, headers=headers)
    bs = BeautifulSoup(res.text, 'html.parser')
    try:
        print(i)
        ul = bs.select_one('#main_pack > section > div > div.group_news > ul')
        divs = ul.select('div.news_wrap.api_ani_send')
        for div in divs:
            title = div.select_one('a.news_tit').text.replace("'", '"')
            link = div.select_one('a.news_tit').attrs.get("href").replace("'", '"')
            headers = {"User-Agent": ua.random}
            content = requests.get(link, headers=headers).text[:10000].replace("'", '"')
            count = content.count('파이썬')
            ws.append([i, title, link, content, count])
            break   # note: only the first article of each result page is kept
    except Exception as e:
        print("error", i, e)
wb.save("result.xlsx")
Exercise: Melon Chart Top 100
Insert the Melon Chart Top 100 into the DB and Excel for each hour
import requests
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
import pymysql
from openpyxl import Workbook

ua = UserAgent()
headers = {"User-Agent": ua.random}
# connect to the DB
db = pymysql.connect(host="localhost", port=3306, user="root", password="{비밀번호}", db="sba")
cursor = db.cursor()
# open an Excel workbook
wb = Workbook()
ws = wb.active
for time in range(17):
    url = f"https://www.melon.com/chart/index.htm?dayTime=20230801{str(time).zfill(2)}"
    res = requests.get(url, headers=headers)
    bs = BeautifulSoup(res.text, 'html.parser')
    tbody = bs.select("#frm > div > table > tbody > tr")
    for tr in tbody:
        rank = int(tr.select_one("span.rank").text.replace("'", '"'))
        title = tr.select_one("div.rank01 > span > a").text.replace("'", '"')
        singer = tr.select_one("div.rank02 > span > a").text.replace("'", '"')
        album = tr.select_one("div.rank03 > a").text.replace("'", '"')
        # like = tr.select_one("button > span.cnt")
        if tr.select_one("span.rank_wrap > span.none"):
            diff = "-"
        elif tr.select_one("span.rank_wrap > span.down"):
            diff = "+" + tr.select_one("span.rank_wrap > span.down").text
        elif tr.select_one("span.rank_wrap > span.up"):
            diff = "-" + tr.select_one("span.rank_wrap > span.up").text
        else:
            diff = ""   # fallback so diff is always defined (e.g. newly charted songs)
        sql = f"""
        INSERT INTO melon VALUES(NULL, '{rank}', '{title}', '{singer}', '{album}', 0, '{diff}', '{str(time).zfill(2)}')
        """
        cursor.execute(sql)
        ws.append([rank, title, singer, album, 0, diff, str(time).zfill(2)])
db.commit()
db.close()
wb.save("melon.xlsx")
For each singer, print the number of songs they have in the TOP 100 along with the singer name, sorted with the most songs first
## For each singer, print the number of songs in the TOP 100 along with the singer name, sorted with the most songs first
import pymysql

db = pymysql.connect(host="localhost", port=3306, user="root", password="{비밀번호}", db="sba")
cursor = db.cursor()
sql = """
SELECT time, singer, COUNT(title) as '곡 수'
FROM melon
GROUP BY time, singer
ORDER BY time, COUNT(title) desc;
"""
cursor.execute(sql)
result = cursor.fetchmany(size=100)
for data in result:
    print(data)
db.close()