ChungHa Brocher_4 : 2021.06.05 Puppeteer Melon Albums Scraping
7349 ワード
Status Quo
After learning how to scrape web pages with Node.js,
I tried to scrape the albums and songs of the artist 'Chungha'.
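Why use Puppeteer? For pagination
In general, people say Puppeteer should be used only as a last resort.
In the beginning, I tried to scrape using just "cheerio" and "request".
But I saw that scraping the paginated results did not work,
even when I changed the 'startIndex' param (it sits after the '#' in the URL, and a URL fragment is never sent to the server, so a plain HTTP request cannot select a page).
After switching to 'puppeteer', it worked.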
url : https://www.melon.com/artist/album.htm?artistId=968265#params%5BlistType%5D=0&params%5BorderBy%5D=ISSUE_DATE&params%5BartistId%5D=968265&po=pageObj&startIndex=1
Code
const request = require('request-promise')
const mongoose = require('mongoose')
const cherrio = require('cheerio')
const AlbumModel = require('../model/Albums')
const SongModel = require('../model/Songs')
const Nightmare = require('nightmare')
const nightmare = Nightmare({show : true})
const puppeteer = require('puppeteer')
const path = require('path')
const {connectToMongoDb} = require('../db/db')
const {sleep} = require('../utils/index')
const dotenv = require('dotenv')
dotenv.config({path:path.join(__dirname,'../../.env')})
// Puppeteer browser handle; assigned inside scrapeAlbumLists().
let browser;
// Album records collected by scrapeAlbumLists(); exported at the bottom of this file.
let melonScrappedResults = []
/**
 * Scrape one page of the artist's album list on Melon.
 *
 * The page is rendered with Puppeteer and the resulting HTML is parsed with
 * cheerio. List entries whose link does not contain a numeric album id are
 * skipped instead of aborting the whole page.
 *
 * @param {number} pgIdx - Value written into the `startIndex` parameter of the URL fragment.
 * @param {object} page - Puppeteer page used to load and render the list.
 * @returns {Promise<[object[], string]>} `[albums, artistName]`. When the page
 *   cannot be loaded or parsed the error is logged and `[[], '']` is returned,
 *   so callers can always destructure the result.
 */
async function scrapeAlbumDescription(pgIdx, page) {
  try {
    // The fragment parameters must be separated by a literal "&params"; a mangled
    // separator silently drops the orderBy/artistId parameters.
    const url = `https://www.melon.com/artist/album.htm?artistId=968265#params%5BlistType%5D=0&params%5BorderBy%5D=ISSUE_DATE&params%5BartistId%5D=968265&po=pageObj&startIndex=${pgIdx}`;
    await page.goto(url, { waitUntil: 'networkidle2' });
    const renderedHtml = await page.evaluate(() => document.body.innerHTML);
    const $ = cherrio.load(renderedHtml);
    const scrapResults = [];

    // Drop the `.none` child first so its text is not part of the artist name.
    $('.title_atist > .none').remove();
    const artistNm = $('.title_atist').text();

    $('.album11_ul > li').each((idx, elem) => {
      const item = $(elem);

      // The album id is the first run of digits in the thumbnail link.
      const albumHref = item.find('.wrap_album04 > a.thumb').attr('href') || '';
      const idMatch = albumHref.match(/\d+/);
      if (!idMatch) {
        return; // no usable link: skip this entry, keep the rest of the page
      }
      const albumId = idMatch[0];

      const albumImg = item.find('img').attr('src');
      const albumUrl = `https://www.melon.com/album/detail.htm?albumId=${albumId}`;
      const albumType = item.find('.vdo_name').text();
      // All whitespace is removed from the album name (it is later used to build song ids).
      const albumName = item.find('dt > .ellipsis').text().trim().replace(/\s/g, '');
      const albumArtistName = item.find('.checkEllipsis > .play_artist').text().trim();
      const titleSong = item.find('.btn_play_song > .songname12').text();
      const albumOpenDate = item.find('.wrap_btn > .cnt_view').text();
      // Drops the last character of the text (presumably a unit suffix after the count).
      const songNums = item.find('.wrap_btn > .tot_song').text().slice(0, -1);

      scrapResults.push({
        artistNm, albumImg, albumId, albumUrl,
        albumType, albumName, albumArtistName,
        titleSong, albumOpenDate, songNums, songs: [],
      });
    });

    return [scrapResults, artistNm];
  } catch (error) {
    console.error(error);
    // Keep the tuple shape so the caller's array destructuring does not throw.
    return [[], ''];
  }
}
/**
 * Fetch every album's detail page and attach the songs performed by the artist.
 *
 * Each album object is updated in place (`album.songs`) and also returned.
 * All detail pages are requested concurrently.
 *
 * @param {object[]} albumResults - Albums carrying at least `albumUrl` and `albumName`.
 * @param {string} artistNm - Only rows whose artist text contains this name are kept.
 * @returns {Promise<object[]>} The same album objects with `songs` filled in.
 */
const scrapSongDesicription = async (albumResults, artistNm) => {
  return Promise.all(
    albumResults.map(async (album) => {
      const detailHtml = await request.get(album.albumUrl);
      const $ = cherrio.load(detailHtml);
      const songLists = [];

      // Document-wide cleanup of the nested span inside the like counter;
      // it does not depend on the row, so it is done once up front.
      $('.button_etc.like > span.cnt > span').remove();

      $('tr').each((idx, elem) => {
        // The first row is never treated as a song (presumably the table header).
        if (idx === 0) {
          return;
        }
        const row = $(elem);

        // Only keep songs performed by the given artist.
        const songArtist = row.find('.checkEllipsis').text();
        if (!songArtist.includes(artistNm)) {
          return;
        }

        const songTitle = row.find('.ellipsis:nth-child(1) > span > a').text();
        const songId = `${album.albumName}_${songTitle}`;
        console.log("songTitle", songTitle);

        // Take the first number in the counter text, accepting thousands
        // separators ("1,234" -> "1234"). A counter without digits yields "0"
        // instead of throwing and rejecting the whole batch.
        const likesText = row.find('td:nth-child(5) > div > button > span.cnt').text();
        const likesMatch = likesText.match(/\d[\d,]*/);
        const songLikes = likesMatch ? likesMatch[0].replace(/,/g, '') : '0';

        songLists.push({ songId, songTitle, songArtist, songLikes, album: album.albumName });
      });

      album.songs = songLists;
      return album;
    })
  );
};
/**
 * Store every song that is not yet present in the songs collection.
 *
 * Each song is looked up by `songId`; a new document is saved only when the
 * lookup finds nothing. All lookups/saves are started together and awaited
 * as a group.
 *
 * @param {object[]} songArray - Song records, each carrying a `songId`.
 * @returns {Promise<void>}
 */
const insertSongInMongoDB = async (songArray) => {
  const saveIfMissing = async (song) => {
    const existingSong = await SongModel.findOne({ songId: song.songId });
    if (existingSong) {
      return undefined;
    }
    return new SongModel(song).save();
  };

  const pendingSaves = songArray.map((song) => saveIfMissing(song));
  await Promise.all(pendingSaves);
};
/**
 * Store every album that is not yet present in the albums collection.
 *
 * Each album is looked up by `albumId`. For an album that is not found, its
 * songs are inserted first (via insertSongInMongoDB) and then the album
 * document is saved. Albums are processed concurrently.
 *
 * @param {object[]} albumArray - Album records carrying `albumId` and `songs`.
 * @returns {Promise<void>}
 */
const insertAlbumInMongoDB = async (albumArray) => {
  const saveAlbumIfNew = async (album) => {
    const existingAlbum = await AlbumModel.findOne({ albumId: album.albumId });
    console.log("album save ongoing");
    if (existingAlbum) {
      return undefined;
    }
    // The document is built before the songs are written, then saved afterwards.
    const albumDoc = new AlbumModel(album);
    await insertSongInMongoDB(album.songs);
    return albumDoc.save();
  };

  const pendingSaves = albumArray.map((album) => saveAlbumIfNew(album));
  await Promise.all(pendingSaves);
};
/**
 * Run the whole scrape: open a browser, walk the album list pages, fetch the
 * songs of every album, and store albums and songs in MongoDB.
 *
 * The MongoDB connection and the browser are always released, even when a
 * step throws; the error is then propagated to the caller. Results are
 * appended to the module-level `melonScrappedResults` array in place, so
 * references to that array taken elsewhere (such as the module export) see
 * the collected albums.
 *
 * @returns {Promise<void>}
 */
const scrapeAlbumLists = async () => {
  // startIndex values requested: 1, 16, 31, 46 (step presumably equals the site's page size).
  const firstIndex = 1;
  const indexLimit = 47;
  const indexStep = 15;

  browser = await puppeteer.launch({ headless: false });
  try {
    const albumPage = await browser.newPage();
    await connectToMongoDb();

    for (let pgIdx = firstIndex; pgIdx < indexLimit; pgIdx += indexStep) {
      const [albumResults, artistNm] = await scrapeAlbumDescription(pgIdx, albumPage);
      const albumsFullData = await scrapSongDesicription(albumResults, artistNm);
      // Append in place instead of rebinding the variable, so the array
      // identity is preserved for anyone already holding a reference to it.
      melonScrappedResults.push(...albumsFullData);
      console.log("\n");
      await insertAlbumInMongoDB(albumsFullData);
      // Pause between pages.
      await sleep(1000);
    }
    console.log("album save complete !!");
  } finally {
    // Release both resources regardless of success; the nested finally makes
    // sure the browser is closed even if disconnecting fails.
    try {
      await mongoose.disconnect();
    } finally {
      await browser.close();
    }
  }
  console.log(melonScrappedResults);
};
// The scraper entry point is disabled; uncomment the call below to run it when this module is loaded.
// scrapeAlbumLists()
// Runs at module load time, so it prints the array as it is at that moment.
console.log("melonScrappedResults",melonScrappedResults)
// Expose the results array to other modules.
module.exports = {melonScrappedResults}
Reference
この問題について(ChungHa Brocher_4 : 2021.06.05 Pupeteer Melon Albums Scrapping), 我々は、より多くの情報をここで見つけました https://velog.io/@dhsys112/ChungHa-Brocher3-2021.06.05-Pupeteer-Melon-Albums-Scrappingテキストは自由に共有またはコピーできます。ただし、このドキュメントのURLは参考URLとして残しておいてください。
Collection and Share based on the CC Protocol