ChungHa Brocher_4 : 2021.06.05 Puppeteer Melon Albums Scraping


Status Quo


After learning how to scrape web pages with Node.js,
I tried to scrape the albums and songs of the artist 'Chungha'.

Why use Puppeteer? For pagination


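In general, people say Puppeteer should be used only as a last resort.
In the beginning, I tried to scrape using just "cheerio" and "request".
But I saw that scraping the paginated results was not working,
even if I changed the param 'startIndex' (the paging parameters sit after the '#' in the URL, so they are handled by JavaScript in the browser rather than sent to the server).
After switching to 'puppeteer', it worked.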
url : https://www.melon.com/artist/album.htm?artistId=968265#params%5BlistType%5D=0&params%5BorderBy%5D=ISSUE_DATE&params%5BartistId%5D=968265&po=pageObj&startIndex=1


Code

const request            = require('request-promise')
const mongoose           = require('mongoose')
const cherrio            = require('cheerio')
const AlbumModel         = require('../model/Albums')
const SongModel          = require('../model/Songs')
const Nightmare          = require('nightmare')
const nightmare          = Nightmare({show : true})
const puppeteer          = require('puppeteer')
const path               = require('path')
const {connectToMongoDb} = require('../db/db')    
const {sleep} = require('../utils/index')    
const dotenv             = require('dotenv')
dotenv.config({path:path.join(__dirname,'../../.env')})

// Puppeteer browser handle; assigned in scrapeAlbumLists when the browser is launched.
let browser;
// Album records (each with its songs) collected across all scraped pages;
// this binding is also what module.exports exposes at the bottom of the file.
let melonScrappedResults = []

/**
 * Scrapes one page of the artist's album list on Melon.
 *
 * The page is rendered with Puppeteer and the resulting HTML is parsed with
 * cheerio, because the paging parameters live in the URL fragment.
 *
 * @param {number} pgIdx - value for the `startIndex` paging parameter.
 * @param {object} page  - Puppeteer page used to load and render the list.
 * @returns {Promise<[Array<object>, string]>} `[albums, artistName]`. Each album
 *   has an empty `songs` array to be filled in later. If the page cannot be
 *   loaded or parsed, the error is logged and `[[], '']` is returned so callers
 *   that destructure the result keep working.
 */
async function scrapeAlbumDescription(pgIdx,page){
    try {
        const url =`https://www.melon.com/artist/album.htm?artistId=968265#params%5BlistType%5D=0&params%5BorderBy%5D=ISSUE_DATE&params%5BartistId%5D=968265&po=pageObj&startIndex=${pgIdx}`
        await page.goto(url,{waitUntil : 'networkidle2'})
        const htmlRequest = await page.evaluate(()=>document.body.innerHTML)
        const $ = cherrio.load(htmlRequest)
        const scrapResults = []

        // Strip the nested '.none' element so only the artist name text remains.
        $('.title_atist > .none').remove()
        const artistNm = $('.title_atist').text()

        $(".album11_ul > li").each((idx,elem)=>{
            // The album id is the first run of digits in the thumbnail link.
            // Skip list items without a usable link instead of throwing and
            // losing every album on the page.
            const albumHref       = $(elem).find('.wrap_album04 > a.thumb').attr('href')
            const albumIdMatch    = albumHref ? albumHref.match(/\d+/) : null
            if(!albumIdMatch) return
            const albumId         = albumIdMatch[0]

            const albumImg        = $(elem).find('img').attr('src')
            const albumUrl        = `https://www.melon.com/album/detail.htm?albumId=${albumId}`
            const albumType       = $(elem).find('.vdo_name').text()
            // All whitespace is removed from the album name, not just the edges.
            const albumName       = $(elem).find('dt > .ellipsis').text().trim().replace(/\s/g,'')
            const albumArtistName = $(elem).find('.checkEllipsis > .play_artist').text().trim()
            const titleSong       = $(elem).find('.btn_play_song > .songname12').text()
            const albumOpenDate   = $(elem).find('.wrap_btn > .cnt_view').text()
            // Drops the last character of the song-count text (presumably a unit suffix).
            const songNums        = $(elem).find('.wrap_btn > .tot_song').text().slice(0,-1)

            scrapResults.push({
                artistNm,albumImg,albumId,albumUrl,
                albumType,albumName,albumArtistName,
                titleSong,albumOpenDate,songNums,songs:[]
            })
        })

        return [scrapResults,artistNm]
    } catch (error) {
        console.error(error)
        // Keep the documented tuple shape on failure; returning undefined here
        // would make the caller's array destructuring throw a second error.
        return [[], '']
    }
}

/**
 * Fetches every album's detail page and attaches the songs performed by the
 * given artist to the album.
 *
 * All album pages are requested concurrently. Each album object is mutated in
 * place (its `songs` property is replaced) and also returned.
 *
 * @param {Array<object>} albumResults - albums with `albumUrl` and `albumName`.
 * @param {string} artistNm - artist name a row's artist text must contain.
 * @returns {Promise<Array<object>>} the same album objects with `songs` filled in.
 */
const scrapSongDesicription = async (albumResults,artistNm) => {
    return await Promise.all(
        albumResults.map(async album =>{
            const htmlResult = await request.get(album.albumUrl)
            const $ = cherrio.load(htmlResult)

            // Remove the nested span inside every like counter once, before any
            // row is read, so the first row is parsed the same way as the rest.
            $('.button_etc.like > span.cnt > span').remove()

            const songLists = []
            $('tr').each((idx,elem)=>{
                // The first <tr> is skipped (presumably the table header).
                if(idx === 0) return

                // Only include (OST) songs that were sung by this artist.
                const songArtist = $(elem).find('.checkEllipsis').text()
                if(!songArtist.includes(artistNm)) return

                const songTitle  = $(elem).find('.ellipsis:nth-child(1) > span > a').text()
                const songId     = album.albumName + '_' + songTitle
                console.log("songTitle",songTitle)

                // Take the first number in the counter text. Thousands separators
                // are kept in the match and then stripped, so "1,234" becomes
                // "1234" rather than "1". A counter without digits falls back to
                // '0' instead of throwing on a null match.
                const likesText  = $(elem).find('td:nth-child(5) > div > button > span.cnt').text()
                const likesMatch = likesText.match(/\d[\d,]*/)
                const songLikes  = likesMatch ? likesMatch[0].replace(/,/g,'') : '0'

                songLists.push({songId,songTitle, songArtist,songLikes,album:album.albumName})
            })
            album.songs = songLists
            return album
        })
    )
}

/**
 * Stores every song that is not yet in the songs collection.
 *
 * Each song is looked up by `songId`; a new document is saved only when the
 * lookup finds nothing. All lookups/saves are started together and awaited
 * as a group.
 *
 * @param {Array<object>} songArray - song records, each carrying a `songId`.
 * @returns {Promise<void>}
 */
const insertSongInMongoDB = async (songArray) => {
    const saveIfMissing = async (candidate) => {
        const existing = await SongModel.findOne({songId:candidate.songId})
        if(existing) return undefined
        const doc = new SongModel(candidate)
        return doc.save()
    }

    const pending = songArray.map((candidate) => saveIfMissing(candidate))
    await Promise.all(pending)
}

/**
 * Stores every album that is not yet in the albums collection, together with
 * its songs.
 *
 * Each album is looked up by `albumId`. When the lookup finds nothing, the
 * album's songs are inserted first and then the album document is saved.
 * Albums are processed concurrently and awaited as a group.
 *
 * @param {Array<object>} albumArray - album records with `albumId` and `songs`.
 * @returns {Promise<void>}
 */
const insertAlbumInMongoDB = async (albumArray) => {
    const persistAlbum = async (entry) => {
        const stored = await AlbumModel.findOne({albumId:entry.albumId})
        console.log("album save ongoing")
        if(stored) return undefined

        const doc = new AlbumModel(entry)
        await insertSongInMongoDB(entry.songs)
        return doc.save()
    }

    const pending = albumArray.map((entry) => persistAlbum(entry))
    await Promise.all(pending)
}

/**
 * Runs the whole scrape: launches a browser, connects to MongoDB, walks the
 * paginated album list, enriches each album with its songs, stores the
 * results and collects them in `melonScrappedResults`.
 *
 * The browser and the MongoDB connection are always released, even when a
 * page fails, and both shutdowns are awaited before the function settles.
 *
 * @returns {Promise<void>} rejects with the original error if any step fails.
 */
const scrapeAlbumLists = async () => {
    // The loop passes 1, 16, 31 and 46 as the list's startIndex value.
    const FIRST_START_INDEX = 1
    const START_INDEX_LIMIT = 47
    const PAGE_STEP         = 15

    try {
        browser = await puppeteer.launch({headless: false});
        const albumPage = await browser.newPage()
        await connectToMongoDb()
        for(let pgIdx = FIRST_START_INDEX ; pgIdx < START_INDEX_LIMIT; pgIdx = pgIdx + PAGE_STEP){
            const [albumResults,artistNm] = await scrapeAlbumDescription(pgIdx,albumPage);
            const albumsFullData    = await scrapSongDesicription(albumResults,artistNm)
            // Mutate the shared array in place: the array object is what gets
            // exported, so reassigning the variable would leave importers
            // holding an array that never receives any results.
            melonScrappedResults.push(...albumsFullData)
            console.log("\n")
            await insertAlbumInMongoDB(albumsFullData)
            // Pause between pages.
            await sleep(1000)
        }
        console.log("album save complete !!")
        console.log(melonScrappedResults)
    } finally {
        // Release both resources on success and on failure; the nested
        // finally ensures a failing disconnect cannot skip closing the browser.
        try {
            await mongoose.disconnect()
        } finally {
            if(browser) await browser.close()
        }
    }
}

// The scrape entry point is currently disabled; uncomment the call to run it when this file is loaded.
// scrapeAlbumLists()
// Logs the results array as it is at load time (it is initialised to an empty array above).
console.log("melonScrappedResults",melonScrappedResults)
// Exposes the array object bound to melonScrappedResults at the moment this line runs.
module.exports = {melonScrappedResults}