library(rvest)
#' Extract the header fields, body text and comments of one article page
#'
#' Reads the page at `url`, pulls the text of the nodes matched by a fixed set
#' of CSS selectors, and applies some regex clean-up to the body and to the
#' comment ("push") fields.
#'
#' @param url Passed straight to `read_html()`.
#' @return A named list of character vectors with the elements `author`,
#'   `title`, `time`, `main_content`, `push`, `push_id`, `push_content` and
#'   `push_time`, in that order. Each element holds the text of the nodes
#'   matched by the corresponding selector.
article_detail <- function(url) {
  raw_html <- read_html(url)

  # Output name -> CSS selector. Keeping both in one named vector means the
  # names of the result can never drift out of step with the selectors.
  selectors <- c(
    author = ".article-metaline:nth-child(1) .article-meta-value",
    title = ".article-metaline-right+ .article-metaline .article-meta-value",
    time = ".article-metaline+ .article-metaline .article-meta-value",
    main_content = "#main-content",
    push = ".push-tag",
    push_id = ".push-userid",
    push_content = ".push-content",
    push_time = ".push-ipdatetime"
  )

  # lapply() keeps the names of `selectors`, so the result is already a
  # correctly named list.
  article_detail_info <- lapply(selectors, function(css) {
    raw_html %>%
      html_nodes(css = css) %>%
      html_text()
  })

  # Body text: join everything onto one line, drop everything from the first
  # "--" onwards, then drop the leading header run that starts at "作者"
  # ("author") and ends with a ":SS YYYY"-shaped timestamp tail.
  # NOTE(review): "--" is presumably the signature/comment separator of the
  # page; any earlier "--" inside the body truncates the text there - confirm.
  article_detail_info$main_content <- article_detail_info$main_content %>%
    gsub(pattern = "\n", ., replacement = "") %>%
    gsub(pattern = "--.+", ., replacement = "") %>%
    gsub(pattern = "作者.+:[0-9]{2}\\s[0-9]{4}", ., replacement = "")

  # Comment tag and user id: remove every whitespace character.
  article_detail_info$push <- gsub(
    pattern = "\\s", article_detail_info$push, replacement = ""
  )
  article_detail_info$push_id <- gsub(
    pattern = "\\s", article_detail_info$push_id, replacement = ""
  )

  # Comment text: remove every whitespace character and every colon.
  # NOTE(review): this also strips spaces and colons inside the comment itself
  # (e.g. in URLs or spaced text); kept as-is because callers may rely on it.
  article_detail_info$push_content <- article_detail_info$push_content %>%
    gsub(pattern = "\\s", ., replacement = "") %>%
    gsub(pattern = ":", ., replacement = "")

  # Comment timestamp: drop a single leading whitespace character, then any
  # newlines.
  article_detail_info$push_time <- article_detail_info$push_time %>%
    gsub(pattern = "^\\s", ., replacement = "") %>%
    gsub(pattern = "\n", ., replacement = "")

  article_detail_info
}