본문 바로가기

PYTHON/자동화

유튜버의 영상 제목 크롤링 / 영상 댓글 크롤링

728x90

텍스트마이닝 해보고 싶은데 아직 파싱도 제대로 못한다 ㅠㅠ
생각보다 진도가 너무 안 나가는 중이라서, 하나하나 이해 못 해도 그냥 따라만이라도 해 봐야겠다.


import time
import urllib
from urllib.parse import urljoin

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys

# Start Chrome through the chromedriver binary at a fixed Windows path and
# open the channel's "videos" tab in a maximized window.
# NOTE(review): passing the driver path as a positional argument is the older
# Selenium call style -- confirm it matches the installed Selenium version.
driver = webdriver.Chrome(r'C:\chromedriver.exe')
driver.get("https://www.youtube.com/user/Sunbaaking/videos")
driver.maximize_window()

# Parse the HTML exactly as it stands right after navigation.
# NOTE(review): nothing waits or scrolls before this snapshot, so entries the
# page adds later will presumably be missing from `soup` -- verify.
source = driver.page_source
soup = BeautifulSoup(source, 'html.parser')

mainlink = "https://www.youtube.com/"  # prefix combined with each video href
titles = []
links = []
SCROLL_PAUSE_TIME = 20  # seconds slept after every scroll step

def scroll_down_end(max_scrolls=None):
    """Scroll the page in the module-level ``driver`` until it stops growing.

    Each round jumps to the bottom of the document (via JavaScript and an
    END key press), sleeps ``SCROLL_PAUSE_TIME`` seconds, and compares the
    document height with the previous round. The loop ends as soon as the
    height is unchanged.

    Args:
        max_scrolls: Optional upper bound on the number of scroll rounds.
            ``None`` (the default) keeps scrolling until the height stops
            changing, which is the original behaviour.

    Returns:
        None.
    """
    # Take the larger of the <body> and <html> heights.
    # NOTE(review): some pages presumably report their scrollable height only
    # on <html>, in which case a body-only probe would never change -- verify
    # against the target site.
    page_height = (
        "Math.max(document.body.scrollHeight, "
        "document.documentElement.scrollHeight)"
    )
    read_height_js = "return " + page_height
    scroll_js = "window.scrollTo(0, " + page_height + ");"

    last_height = driver.execute_script(read_height_js)
    scrolls = 0

    while max_scrolls is None or scrolls < max_scrolls:
        # Jump to the bottom of the page.
        driver.execute_script(scroll_js)
        ActionChains(driver).key_down(Keys.END).key_up(Keys.END).perform()
        scrolls += 1

        # Pause before measuring the height again.
        time.sleep(SCROLL_PAUSE_TIME)

        # Stop once scrolling no longer makes the page taller.
        new_height = driver.execute_script(read_height_js)
        if new_height == last_height:
            break
        last_height = new_height


# Collect every <a id="video-title"> anchor from the parsed channel page
# (equivalent to soup.find_all('a', id='video-title')) and print each
# video's title followed by its absolute URL.
titles = soup.select("a[id=video-title]")
for title in titles:
    print(title.text)
    href = title.get('href')
    if href:
        # urljoin avoids the doubled slash that plain concatenation produces
        # when the href already starts with "/".
        print(urljoin(mainlink, href))


# Open the first listed video, scroll through its page, and print the text
# of every comment element found in the resulting HTML.
title0 = titles[0] if titles else None
first_href = title0.get('href') if title0 is not None else None

if first_href:
    driver.get(urljoin(mainlink, first_href))

    scroll_down_end()
    print("스크롤 후")

    # Re-parse the page now that scrolling has finished.
    source = driver.page_source
    soup = BeautifulSoup(source, 'html.parser')

    comments = soup.select("yt-formatted-string[id=content-text]")
    for comment in comments:
        print(comment.text)
else:
    # Nothing was parsed from the channel page, so there is no video to open.
    print("No video link was found on the channel page; skipping comments.")
print("끝")