C# 使用 PhantomJS 采集页面 HTML

code.js

// PhantomJS script: loads the URL passed on the command line and prints the
// page's HTML to stdout.
//
// Usage:     phantomjs code.js <url>
// Exit code: 0 when the page content was printed, 1 when no URL was given or
//            the page could not be loaded.

// Console output is written as GB2312; whoever reads stdout must decode it
// with the same encoding.
phantom.outputEncoding = "gb2312";

// Declared with `var` so they do not leak as implicit globals.
var system = require('system');
var page = require('webpage').create();

// Second command-line argument (the first one after the script name).
var url = system.args[1];

// Single failure path: prints the same marker on stdout in every failure
// case, so a caller that reads stdout sees one consistent message.
function fail() {
    console.log('Unable to post!');
    phantom.exit(1);
}

if (!url) {
    // Without a URL there is nothing to open; report failure right away
    // instead of handing `undefined` to page.open.
    fail();
} else {
    page.open(url, function (status) {
        if (status !== 'success') {
            fail();
            return;
        }
        // page.content is the markup of the loaded document. Other page
        // data can be obtained by running a function inside the page with
        // page.evaluate, e.g. returning document.title.
        console.log(page.content);
        phantom.exit(0);
    });
}

C#代码

/// <summary>
/// Starts phantomjs.exe with the script <c>code.js</c> and the given URL, and returns
/// everything the process wrote to standard output.
/// </summary>
/// <param name="url">Address of the page to load; passed to the script as its only argument.</param>
/// <returns>The captured standard output of the PhantomJS process, unmodified.</returns>
public string getAjaxCotnent(String url)
{
    // phantomjs.exe is looked up in the entry assembly's directory. GetEntryAssembly()
    // can return null (e.g. when hosted by unmanaged code), so fall back to the
    // application base directory in that case.
    var entry = System.Reflection.Assembly.GetEntryAssembly();
    var path = entry != null
        ? Path.GetDirectoryName(entry.Location)
        : System.AppDomain.CurrentDomain.BaseDirectory;

    ProcessStartInfo start = new ProcessStartInfo(Path.Combine(path, "phantomjs.exe"));

    // Quote the URL so it always reaches the script as a single argument. A literal
    // double quote is percent-encoded so it cannot terminate the argument early.
    string quotedUrl = "\"" + (url ?? string.Empty).Replace("\"", "%22") + "\"";
    // NOTE(review): "code.js" is resolved relative to the process working directory,
    // not to the directory of phantomjs.exe - confirm that is intended.
    start.Arguments = "code.js " + quotedUrl;

    start.CreateNoWindow = true;          // do not show a console window
    start.RedirectStandardOutput = true;  // capture the script's output
    start.RedirectStandardInput = true;   // kept as before; this method never writes to it
    start.UseShellExecute = false;        // required for stream redirection

    // code.js sets phantom.outputEncoding to "gb2312", so decode stdout with the same
    // encoding instead of relying on the platform default.
    // NOTE(review): on .NET Core / .NET 5+ this requires the code-pages encoding
    // provider to be registered - confirm the target framework.
    start.StandardOutputEncoding = Encoding.GetEncoding("gb2312");

    // `using` releases the process handle and the stream even if reading throws.
    using (Process p = Process.Start(start))
    using (StreamReader reader = p.StandardOutput)
    {
        // Drain stdout before waiting, so the child cannot block on a full pipe.
        string output = reader.ReadToEnd();
        p.WaitForExit();

        // The script prints page.content as-is (it is not URL-encoded), so the text
        // is returned unchanged; URL-decoding it would turn '+' into spaces and
        // rewrite any "%xx" sequences inside the HTML.
        return output;
    }
}

调用

// Page to fetch; the query string carries a percent-encoded "context" value.
string url = "http://po.baidu.com/feed/share?context=%7B%22nid%22%3A%22news_3493960622142783493%22%2C%22sourceFrom%22%3A%22bjh%22%7D";
// Load the page through PhantomJS and keep the HTML it printed.
string detailHtml = getAjaxCotnent(url);
本文为“老吴笔记”的原创文章，遵循 CC 4.0 BY-SA 版权协议，转载请附上原文出处链接及本声明。

发表回复

您的邮箱地址不会被公开。 必填项已用 * 标注