Home [SeSAC]파이썬 데이터 처리 프로그래밍-4일차
Post
Cancel

[SeSAC]파이썬 데이터 처리 프로그래밍-4일차

Selenium

네이버 주식 크롤링

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
import requests
import time
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys

service = Service()
d = webdriver.Chrome(service=service)

try:
    url = 'https://finance.naver.com/'
    d.get(url)
    trs = WebDriverWait(d, 10).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, '#_topItems1 > tr')))
    for tr in trs:
        updown = tr.get_attribute("class")
        if updown == "up":
            sign = "+"
        elif updown == "down":
            sign = "-"
        else: sign = ""
        item = tr.find_element(By.CSS_SELECTOR, "th > a").text.strip()
        tds = tr.find_elements(By.CSS_SELECTOR, "td")
        price = tds[0].text.strip()
        num = tds[1].text.strip().replace("\n", "")
        rate = tds[2].text.strip()
        print(f"{item}, {price}, {num}, {rate}")
except Exception as e:
    print(e)
finally:
    d.close()
    d.quit()

인스타그램

Selenium 정보 저장

  1. 빈 파일 생성
  2. 로그인정보를 저장하기 위해 추가된 코드
1
2
3
4
5
# 크롬에 정보 저장
options = webdriver.ChromeOptions()
options.add_argument("--user-data-dir={디렉토리 경로}")
service = Service()
d = webdriver.Chrome(service=service, options=options)
  1. 코드 중간에 input()을 넣어 두고 실행 (코드 일시정지 용도)
  2. 열린 크롬 창에서 인스타 로그인
  3. input()에 입력 하여 코드 종료

인스타 #python 크롤링

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
import requests
import time
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains

# 크롬에 정보 저장
options = webdriver.ChromeOptions()
options.add_argument("--user-data-dir=/Users/syshin/Desktop/Syshin/SeSAC/파이썬 데이터 처리 프로그래밍/4일차-/MyChrome")
service = Service()
d = webdriver.Chrome(service=service, options=options)


try:
    url = 'https://www.instagram.com/'
    d.get(url)

    # 검색 버튼
    buttons = WebDriverWait(d, 10).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, ".xvy4d1p")))
    # buttons = d.find_elements(By.CSS_SELECTOR, ".xvy4d1p")
    search_button = buttons[2]
    
    # 검색 버튼 클릭
    ac = ActionChains(d)
    ac.move_to_element(search_button).click().pause(1)
    ac.perform()
    
    elem = WebDriverWait(d, 10).until(EC.presence_of_element_located((By.CSS_SELECTOR, ".x7xwk5j")))
    ac.reset_actions()
    # ac.move_to_element(elem).click().send_keys("#파이썬")
    ac.send_keys_to_element(elem, "#파이썬")
    ac.pause(1)
    ac.move_by_offset(0,100)
    ac.click()
    ac.perform()
    
    posts = WebDriverWait(d, 10).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "._aabd")))
    for post in posts:
        ac.reset_actions()
        ac.click(post)
        # ac.pause(1)
        ac.perform()
        content = WebDriverWait(d, 10).until(EC.presence_of_element_located((By.CSS_SELECTOR, "._a9zs > ._aade")))
        print(content.text)
        ac.reset_actions()
        ac.send_keys(Keys.ESCAPE)
        ac.perform()

    
except Exception as e:
    print(e)
finally:
    time.sleep(3)
    d.close()
    d.quit()
    

인스타 크롤링 #고양이 검색, 엑셀에 저장 실습

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
from selenium import webdriver
from openpyxl import Workbook
from selenium.webdriver import ChromeOptions
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains

wb = Workbook()
ws = wb.active

options = ChromeOptions()

# options.add_argument("--headless") # 브라우저 안뜬 상태로 진행
# options.add_argument("--window-size=1920,1080)") # 브라우저 사이즈 조절
options.add_argument("--user-data-dir=/Users/syshin/Desktop/Syshin/SeSAC/파이썬 데이터 처리 프로그래밍/4일차-/MyChrome")
service = Service()
d = webdriver.Chrome(service=service, options=options)
url = "https://www.instagram.com"

try:
    d.get(url)
    buttons = WebDriverWait(d, 10).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, '.xvy4d1p')))
    search_button = buttons[2]
    
    ac = ActionChains(d)
    ac.move_to_element(search_button).click().pause(1).perform()
    
    elem = WebDriverWait(d, 10).until(EC.presence_of_element_located((By.CSS_SELECTOR, "input.x7xwk5j")))
    
    ac.reset_actions()
    ac.send_keys_to_element(elem, "#고양이").pause(2)
    ac.send_keys(Keys.TAB, Keys.TAB, Keys.ENTER).perform()
    
    ac = ActionChains(d)
    posts = WebDriverWait(d, 10).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "div._al3l > a._a6hd")))

    for post in posts:
        ac.reset_actions()
        ac.click(post).perform()
        content = WebDriverWait(d, 10).until(EC.presence_of_element_located((By.CSS_SELECTOR, "div._a9zs> h1._aade"))).text
        ac.reset_actions()
        ws.append([content])
        ac.send_keys(Keys.ESCAPE).perform()
except Exception as e:
    print(e)
    
finally:
    time.sleep(3)
    d.close()
    d.quit()
    wb.save("instagram.xlsx")

미니프로젝트-멜론, 인스타에서 TOP 100 가수,곡 정보 크롤링, DB 저장

Except 위치 나타내기

1
2
3
4
import traceback
try:
except:
	traceback.print_exc()

1) 가수 테이블 생성

  • id(Autoincrement 필드, PK)
  • name(가수이름)
  • follower(팔로우 수)

2) song 테이블 생성

  • id(Autoincrement, PK)
  • singer_id(FK)
  • title(제목)
  • album(앨범)

3) 멜론 TOP 100 수집

  • 멜론 TOP 100에 존재하는 가수들을 singer테이블에 저장
  • 각 가수의 곡들은 SONG 테이블에 저장
  • 단, singer테이블에 follower으로 0으로 저장

4) instagram에서 가수 검색 follower 수집

  • 단, 인증마크가 있는 가수만
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
import traceback
from selenium import webdriver
from selenium.webdriver import ChromeOptions
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
import pymysql
from selenium.webdriver.chrome.service import Service
import time

db = pymysql.connect(host="localhost", port=3306, user="root", password="{비밀번호}", db="sba")
cursor = db.cursor()

options = webdriver.ChromeOptions()
options.add_argument("--user-data-dir=/Users/syshin/Desktop/Syshin/SeSAC/파이썬 데이터 처리 프로그래밍/4일차-/MyChrome")
service = Service()
d = webdriver.Chrome(service=service, options=options)
url_melon = "https://www.melon.com/chart/index.htm"
url_instagram = "https://www.instagram.com"

try:
    # 멜론 TOP 100
    d.get(url_melon)
    trs = WebDriverWait(d, 10).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "#frm > div > table > tbody > tr")))
    already = []
    
    # DB 초기화
    sql_delete_song = 'DELETE FROM song;'
    cursor.execute(sql_delete_song)
    sql_delete_singer = 'DELETE FROM singer;'
    cursor.execute(sql_delete_singer)
    
    for tr in trs:
        # 곡, 가수, 앨범명 수집
        title = tr.find_element(By.CSS_SELECTOR, 'div.rank01 > span > a').text.strip().replace("'", '"')
        singer = tr.find_element(By.CSS_SELECTOR, 'div.ellipsis.rank02 > a').text.strip().replace("'", '"')
        album = tr.find_element(By.CSS_SELECTOR, 'div.ellipsis.rank03 > a').text.strip().replace("'", '"')
        
        # 쿼리 생성 (SINGER)
        if singer not in already:
            sql_singer = f"""
            INSERT INTO singer VALUES(NULL, '{singer}', 0)
            """
            cursor.execute(sql_singer)
            already.append(singer) # 중복 확인
        
        # 쿼리 생성 (SONG)
        sql_song = f"""
        INSERT INTO song VALUES(NULL, (SELECT id FROM singer WHERE name='{singer}' LIMIT 1), '{title}', '{album}')
        """    
        cursor.execute(sql_song)
    db.commit()
    
    # 가수명 DB에서 검색
    select_sql = "SELECT name FROM singer"
    cursor.execute(select_sql)
    singer_list = cursor.fetchmany(size=100)
    
    # 인스타 접속, 공식 계정의 팔로워 수 수집, DB에 저장
    d.get(url_instagram)
    authorized = []
    ac = ActionChains(d)
    
    # 검색 버튼
    buttons = WebDriverWait(d, 10).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, '.xvy4d1p')))
    search_button = buttons[2]
    ac.move_to_element(search_button).click().pause(1).perform()
    
    
    first = True # 첫번째 검색시에만 검색버튼을 눌러줘야 하므로 첫번째인지 변수에 저장
    
    # 가수마다 공식계정 검색
    for singers in singer_list:
        for singer in singers:
            elem = WebDriverWait(d, 10).until(EC.presence_of_element_located((By.CSS_SELECTOR, "input.x7xwk5j")))
            ac.reset_actions()
            ac.send_keys_to_element(elem, singer).pause(2).perform()
            
            
            # 공식계정 있을 시 팔로워수 DB에 저장
            authorized = []
            authorized = d.find_elements(By.CSS_SELECTOR, 'div.x9f619.xjbqb8w.x1rg5ohu.x168nmei.x13lgxp2.x5pf9jr.xo71vjh.xsgj6o6.x1uhb9sk.x1plvlek.xryxfnj.x1c4vz4f.x2lah0s.xdt5ytf.xqjyukv.x1qjc9v5.x1oa3qoh.x1nhvcw1')
            ac.reset_actions()
            if authorized:
                ac.move_to_element(authorized[0]).click().pause(2).perform()
                followers = WebDriverWait(d, 10).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, 'span._ac2a > span')))
                
                # DB 저장
                sql_update_singer = f"""
                    UPDATE singer SET follower = '{followers[1].text}' WHERE name = '{singer}'
                """
                cursor.execute(sql_update_singer)
                
                # 검색 버튼 다시 열기
                if first:
                    buttons = WebDriverWait(d, 10).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, '.xvy4d1p')))
                    search_button = buttons[2]
                    ac.reset_actions()
                    ac.move_to_element(search_button).click().pause(1).perform()
                    first = False
                else:
                    elem = WebDriverWait(d, 10).until(EC.presence_of_element_located((By.CSS_SELECTOR, "input.x7xwk5j")))
                    elem.clear()
                
            # 공식 계정이 없을 시, 검색창 비움
            else:
                time.sleep(1)
                elem = WebDriverWait(d, 10).until(EC.presence_of_element_located((By.CSS_SELECTOR, "input.x7xwk5j")))
                elem.clear()
except Exception as e:
    # print(e)
    traceback.print_exc()
finally:
    # websriver 종료
    d.close()
    d.quit()
    
    # DB 커밋, 종료
    db.commit()
    db.close()
This post is licensed under CC BY 4.0 by the author.

[자바의정석]2장. 변수

[SeSAC]파이썬 데이터 처리 프로그래밍-5일차