本文最后更新于2021-11-24,已超过1年没有更新,如果文章内容、图片或者下载资源失效,请留言反馈,我会及时处理,谢谢!
温馨提示:本文共5532个字,读完预计14分钟。
开门见山地说,代码是我从GitHub上拿的。但那个项目年代比较久远,公众号的一些规则已经变了,原代码不能用了,所以我改了部分代码,使其可以直接使用。
功能:根据微信公众号链接,爬取文章的文字和图片(下载到本地),以html形式保存。
直接贴代码,就一个文件,可以直接用
程序代码如下:
<?php

namespace WxCrawler;

/**
 * Crawler for WeChat Official Account articles.
 *
 * Fetches an article page, extracts its basic metadata and its body, saves
 * every image referenced through a `data-src` attribute into a local
 * directory and rewrites the body so it points at the local copies.
 *
 * Requires the GD extension (imagecreatefromstring / imagejpeg).
 */
class WxCrawler
{
    /**
     * Regex that captures the article body container.
     *
     * NOTE(review): the pattern relies on the exact attribute string in the
     * page markup, and the lazy `(.*?)` stops at the first `</div>`, so a body
     * that contains nested <div> elements is cut short. Left unchanged here;
     * confirm against the current page markup.
     */
    private $wxContentDiv = '/<div class="rich_media_content " id="js_content" style="visibility: hidden;">(.*?)<\/div>/s';

    /** Inline style attribute appended to every rewritten <img> tag. */
    private $imageStyle = 'style="max-width: 677px !important;height: auto !important;visibility: visible !important;"';

    /** Directory (relative to the working directory) that receives the images. */
    private $imageDir = 'article/';

    /**
     * Downloads the raw page source.
     *
     * @param string $url
     * @return false|string page source, or false when the request fails
     */
    private function _get($url)
    {
        return file_get_contents($url);
    }

    /**
     * Crawls one article.
     *
     * @param string $url article URL
     * @return array the basic-info keys produced by articleBasicInfo() plus
     *               'content_html' (body with local images) and
     *               'content_text' (body with <img> tags removed)
     * @throws \RuntimeException when the page cannot be fetched or the article
     *                           body is not found in it
     */
    public function crawByUrl($url)
    {
        $content = $this->_get($url);
        if ($content === false || $content === '') {
            // A failed fetch can never contain an article body; report it the
            // same way as a page without one instead of feeding false to preg_*.
            throw new \RuntimeException('文章不存在');
        }

        $basicInfo = $this->articleBasicInfo($content);
        list($contentHtml, $contentText) = $this->contentHandle($content);

        return array_merge($basicInfo, [
            'content_html' => $contentHtml,
            'content_text' => $contentText,
        ]);
    }

    /**
     * Extracts the article body from the page source and localises its images.
     *
     * @param string $content page source of the article
     * @return array [html with images, html without images]
     * @throws \RuntimeException when the body container is not present
     */
    private function contentHandle($content)
    {
        if (preg_match($this->wxContentDiv, $content, $bodyMatch) !== 1) {
            throw new \RuntimeException('文章不存在');
        }

        // The full match is kept, i.e. the body including its wrapper <div>.
        $contentHtml = $bodyMatch[0];

        // Drop the attribute that keeps the body hidden.
        $contentHtml = str_replace('style="visibility: hidden;"', '', $contentHtml);

        // Strip iframes; the `s` modifier lets the match span line breaks.
        $contentHtml = preg_replace('/<iframe(.*?)<\/iframe>/s', '', $contentHtml);

        // Replace each data-src with a src that points at the downloaded file.
        $contentHtml = preg_replace_callback('/data-src="(.*?)"/', function ($m) {
            // Attribute values are HTML-encoded; decode before using as a URL.
            $remoteUrl = html_entity_decode($m[1], ENT_QUOTES, 'UTF-8');
            $fileName = $this->getImg($remoteUrl);

            // If the download failed, keep pointing at the remote image rather
            // than emitting a link to a file that does not exist.
            $src = $fileName === null ? $m[1] : $this->imageDir . $fileName;

            return 'src="' . $src . '" ' . $this->imageStyle;
        }, $contentHtml);

        // Wrap the body in a centred, width-limited container.
        $contentHtml = '<div style="max-width: 677px;margin-left: auto;margin-right: auto;">' . $contentHtml . '</div>';

        // Same markup with every <img> tag removed.
        $contentText = preg_replace('/<img.*?>/s', '', $contentHtml);

        return [$contentHtml, $contentText];
    }

    /**
     * Reads the basic article information from the JavaScript variables
     * embedded in the page source.
     *
     * @param string $content page source of the article
     * @return array keys: author, copyright_stat, date, title, digest,
     *               content_url, cover, wechatname; a value is '' when the
     *               corresponding variable is not found
     */
    private function articleBasicInfo($content)
    {
        // Page variable name => key in the returned array.
        $item = [
            'ct'          => 'date',        // publish time
            'msg_title'   => 'title',       // title
            'msg_desc'    => 'digest',      // description
            'msg_link'    => 'content_url', // article link
            'msg_cdn_url' => 'cover',       // cover image link
            'nickname'    => 'wechatname',  // account name
        ];

        // Extraction of these two fields from the page markup stopped working
        // (noted 2020/4/3 in the original comments) and was disabled, so they
        // are always returned as empty strings.
        $basicInfo = [
            'author'         => '',
            'copyright_stat' => '',
        ];

        foreach ($item as $varName => $key) {
            // The title is assigned as `<expr>.html(false);`, the other
            // variables as plain double-quoted strings.
            if ($varName === 'msg_title') {
                $pattern = '/ var ' . $varName . ' = (.*?)\.html\(false\);/s';
            } else {
                $pattern = '/ var ' . $varName . ' = "(.*?)";/s';
            }

            $found = preg_match($pattern, $content, $m) === 1 && !empty($m[1]);
            $basicInfo[$key] = $found ? $this->htmlTransform($m[1]) : '';
        }

        return $basicInfo;
    }

    /**
     * Converts the HTML entities used in the embedded page variables back to
     * plain characters and removes backslashes.
     *
     * The replacements run in the listed order: `&amp;` is decoded first and a
     * leftover `amp;` is then removed, so a double-encoded `&amp;amp;` also
     * ends up as a single `&`.
     *
     * @param string $string
     * @return string
     */
    private function htmlTransform($string)
    {
        $search  = ['&quot;', '&amp;', 'amp;', '&lt;', '&gt;', '&nbsp;', "\\"];
        $replace = ['"',      '&',     '',     '<',    '>',    ' ',      ''];

        // With array arguments str_replace applies the pairs one after another.
        return str_replace($search, $replace, $string);
    }

    /**
     * Downloads one image and stores it as a JPEG in the image directory.
     *
     * @param string $url image URL
     * @return string|null generated file name (without directory), or null
     *                     when the image could not be downloaded, decoded or
     *                     written
     */
    private function getImg($url)
    {
        $refer = "http://www.qq.com/";
        $context = stream_context_create([
            'http' => [
                'header' => "Referer: " . $refer,
            ],
        ]);

        $bytes = file_get_contents($url, false, $context);
        if ($bytes === false || $bytes === '') {
            return null;
        }

        $image = imagecreatefromstring($bytes);
        if ($image === false) {
            return null;
        }

        $dir = $this->imageDir;
        // The second is_dir() covers the case where the directory appears
        // between the first check and the mkdir() call.
        if (!is_dir($dir) && !mkdir($dir, 0777, true) && !is_dir($dir)) {
            return null;
        }

        // Timestamp + random suffix can repeat within one second; retry until
        // the name is unused so an earlier image is never overwritten.
        do {
            $fileName = time() . mt_rand(0, 99999) . '.jpg';
        } while (file_exists($dir . $fileName));

        if (!imagejpeg($image, $dir . $fileName)) {
            return null;
        }

        return $fileName;
    }
}

// Demo: crawl one article when this file is executed directly. The guard keeps
// the file includable without triggering a crawl.
if (isset($_SERVER['SCRIPT_FILENAME']) && realpath($_SERVER['SCRIPT_FILENAME']) === __FILE__) {
    $url = 'https://mp.weixin.qq.com/s/4gwonJ3m0wd-kwTA3SmU-g';
    $crawler = new WxCrawler();
    try {
        $content = $crawler->crawByUrl($url);
        echo $content['content_html'];
    } catch (\RuntimeException $e) {
        echo $e->getMessage();
    }
}
————————————————
版权声明:本文为CSDN博主「Me佳佳丶」的原创文章,遵循CC 4.0 BY-SA版权协议,转载请附上原文出处链接及本声明。
原文链接:https://blog.csdn.net/q6627666/article/details/105432090