博客
关于我
强烈建议你试试无所不能的 ChatGPT,快点击我
HTMLHelper
阅读量:6116 次
发布时间:2019-06-21

本文共 12084 字,大约阅读时间需要 40 分钟。

using System;

using System.Text;
using System.Net;
using System.IO;
using System.Threading;
using System.Text.RegularExpressions;

/// <summary>
/// Helpers for downloading pages with <see cref="HttpWebRequest"/> and for simple,
/// regex-based clean-up and scraping of HTML text, plus a few ASP.NET WebForms
/// convenience methods that build resource tags.
/// </summary>
/// <remarks>
/// The download settings (encoding, delay, retry count) are static and shared by all
/// callers. The HTML routines are regex heuristics, not a real HTML parser.
/// </remarks>
public class HTMLHelper
{
    #region Private fields
    private static readonly CookieContainer cc = new CookieContainer();
    private const string contentType = "application/x-www-form-urlencoded";
    private const string accept = "image/gif, image/x-xbitmap, image/jpeg, image/pjpeg, application/x-shockwave-flash, application/x-silverlight, application/vnd.ms-excel, application/vnd.ms-powerpoint, application/msword, application/x-ms-application, application/x-ms-xbap, application/vnd.ms-xpsdocument, application/xaml+xml, application/x-silverlight-2-b1, */*";
    private const string userAgent = "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; .NET CLR 2.0.50727; .NET CLR 3.0.04506.648; .NET CLR 3.5.21022)";
    private static Encoding encoding = Encoding.GetEncoding("utf-8");
    private static int delay = 1000;
    private static int maxTry = 300;

    // One shared generator: a new Random() per call is seeded from the clock and
    // repeats values when called in quick succession. Random is not thread-safe,
    // hence the lock object.
    private static readonly Random random = new Random();
    private static readonly object randomLock = new object();

    // Attribute value in double quotes (group 1), single quotes (group 2) or unquoted (group 3).
    private static readonly Regex hrefAttribute = new Regex(
        @"(?<![\w-])href\s*=\s*(?:""([^""]*)""|'([^']*)'|([^\s""'>]+))", RegexOptions.IgnoreCase);
    private static readonly Regex srcAttribute = new Regex(
        @"(?<![\w-])src\s*=\s*(?:""([^""]*)""|'([^']*)'|([^\s""'>]+))", RegexOptions.IgnoreCase);

    // A value is absolute when it starts with "scheme:" or is protocol-relative ("//host/...").
    private static readonly Regex absoluteUrl = new Regex(
        @"^(?:[a-z][a-z0-9+.\-]*:|//)", RegexOptions.IgnoreCase);

    // Legacy heuristic kept for scheme-less hosts such as "www.site.com/a.jpg".
    private static readonly string[] knownDomainSuffixes =
        { ".net", ".com", ".org", ".cn", ".cc", ".info", ".biz", ".tv" };

    private static readonly Regex htmlEntity = new Regex(
        @"&(quot|amp|lt|gt|nbsp|iexcl|cent|pound|copy|#\d+);", RegexOptions.IgnoreCase);
    #endregion

    #region Public properties

    /// <summary>
    /// The shared cookie container owned by this class.
    /// </summary>
    public static CookieContainer CookieContainer
    {
        get
        {
            return cc;
        }
    }

    /// <summary>
    /// Encoding used to decode response bodies in <c>GetHtml</c>. Defaults to UTF-8.
    /// </summary>
    public static Encoding Encoding
    {
        get
        {
            return encoding;
        }
        set
        {
            encoding = value;
        }
    }

    /// <summary>
    /// Setting stores the base delay in milliseconds; getting returns a random value
    /// in the range [base, 2 * base) so consecutive requests are not evenly spaced.
    /// </summary>
    /// <exception cref="ArgumentOutOfRangeException">The assigned value is negative.</exception>
    public static int NetworkDelay
    {
        get
        {
            int lower = delay;
            // Widen before doubling so a large base delay cannot overflow to a negative bound.
            int upper = (int)Math.Min((long)lower * 2, int.MaxValue);
            lock (randomLock)
            {
                return random.Next(lower, upper);
            }
        }
        set
        {
            // A negative base used to surface later as an exception from Random.Next;
            // reject it where it is assigned instead.
            if (value < 0)
            {
                throw new ArgumentOutOfRangeException("value", "NetworkDelay must not be negative.");
            }
            delay = value;
        }
    }

    /// <summary>
    /// Number of retries after a failed first attempt (so at most MaxTry + 1 requests
    /// are sent per call). The same value is also applied as the connection limit of
    /// the request's service point.
    /// </summary>
    public static int MaxTry
    {
        get
        {
            return maxTry;
        }
        set
        {
            maxTry = value;
        }
    }
    #endregion

    #region Request plumbing

    /// <summary>
    /// Total number of attempts allowed per call: the first try plus MaxTry retries.
    /// </summary>
    private static long AttemptCount()
    {
        return (long)Math.Max(0, maxTry) + 1;
    }

    /// <summary>
    /// True for failures that are worth retrying (network and I/O errors).
    /// </summary>
    private static bool IsTransient(Exception ex)
    {
        return ex is WebException || ex is IOException;
    }

    /// <summary>
    /// Builds a request carrying the headers every download in this class sends.
    /// </summary>
    private static HttpWebRequest CreateRequest(string url, CookieContainer cookieContainer, string method)
    {
        HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
        request.CookieContainer = cookieContainer;
        request.ContentType = contentType;
        // ConnectionLimit rejects values below 1, so clamp the shared MaxTry setting.
        request.ServicePoint.ConnectionLimit = Math.Max(1, maxTry);
        request.Referer = url;
        request.Accept = accept;
        request.UserAgent = userAgent;
        request.Method = method;
        return request;
    }

    /// <summary>
    /// Sends a GET (when <paramref name="body"/> is null) or a POST with the given body
    /// and returns the response. The caller owns the response and must close it.
    /// </summary>
    private static HttpWebResponse Send(string url, byte[] body, CookieContainer cookieContainer)
    {
        HttpWebRequest request = CreateRequest(url, cookieContainer, body == null ? "GET" : "POST");
        if (body != null)
        {
            request.ContentLength = body.Length;
            using (Stream requestStream = request.GetRequestStream())
            {
                requestStream.Write(body, 0, body.Length);
            }
        }
        return (HttpWebResponse)request.GetResponse();
    }

    /// <summary>
    /// Downloads a page as text, sleeping <see cref="NetworkDelay"/> before every
    /// attempt and retrying transient failures. Returns an empty string when the URL
    /// is missing, on a non-retryable error, or when every attempt failed.
    /// </summary>
    private static string FetchHtml(string url, byte[] body, CookieContainer cookieContainer)
    {
        if (string.IsNullOrEmpty(url))
        {
            return string.Empty;
        }

        long attempts = AttemptCount();
        for (long attempt = 0; attempt < attempts; attempt++)
        {
            Thread.Sleep(NetworkDelay);
            try
            {
                using (HttpWebResponse response = Send(url, body, cookieContainer))
                using (Stream responseStream = response.GetResponseStream())
                using (StreamReader reader = new StreamReader(responseStream, encoding))
                {
                    return reader.ReadToEnd();
                }
            }
            catch (Exception ex)
            {
                // Malformed URLs, unsupported schemes and similar errors fail the same
                // way every time, so only network/I-O failures go round the loop again.
                if (!IsTransient(ex))
                {
                    return string.Empty;
                }
            }
        }
        return string.Empty;
    }
    #endregion

    #region Get HTML

    /// <summary>
    /// Downloads a page as text, optionally submitting form data.
    /// </summary>
    /// <param name="url">Address to request.</param>
    /// <param name="postData">
    /// Form data string. When null or empty, a plain GET is issued regardless of
    /// <paramref name="isPost"/>.
    /// </param>
    /// <param name="isPost">
    /// True to send <paramref name="postData"/> as a POST body; false to append it to
    /// the URL as a query string and issue a GET.
    /// </param>
    /// <param name="cookieContainer">Cookie container attached to the request; may be null.</param>
    /// <returns>The response body, or an empty string if the page could not be fetched.</returns>
    public static string GetHtml(string url, string postData, bool isPost, CookieContainer cookieContainer)
    {
        if (string.IsNullOrEmpty(postData))
        {
            return GetHtml(url, cookieContainer);
        }

        if (string.IsNullOrEmpty(url))
        {
            return string.Empty;
        }

        if (!isPost)
        {
            // A GET request cannot carry a body (GetRequestStream throws
            // ProtocolViolationException), so the data travels in the query string.
            string separator = url.IndexOf('?') >= 0 ? "&" : "?";
            return FetchHtml(url + separator + postData, null, cookieContainer);
        }

        // NOTE(review): the body is encoded with the system default encoding while the
        // response is decoded with the configurable Encoding property. Kept as-is
        // because existing callers may rely on it - confirm before unifying.
        byte[] body = Encoding.Default.GetBytes(postData);
        return FetchHtml(url, body, cookieContainer);
    }

    /// <summary>
    /// Downloads a page as text with a GET request.
    /// </summary>
    /// <param name="url">Address to request.</param>
    /// <param name="cookieContainer">Cookie container attached to the request; may be null.</param>
    /// <returns>The response body, or an empty string if the page could not be fetched.</returns>
    public static string GetHtml(string url, CookieContainer cookieContainer)
    {
        return FetchHtml(url, null, cookieContainer);
    }
    #endregion

    #region Get stream

    /// <summary>
    /// Opens the response body of a GET request as a stream, retrying transient failures.
    /// </summary>
    //---------------------------------------------------------------------------------------------------------------
    // Example:
    // System.Net.CookieContainer cookie = new System.Net.CookieContainer();
    // Stream s = HTMLHelper.GetStream("", cookie);
    // picVerify.Image = Image.FromStream(s);
    //---------------------------------------------------------------------------------------------------------------
    /// <param name="url">Address to request.</param>
    /// <param name="cookieContainer">Cookie container attached to the request; may be null.</param>
    /// <returns>
    /// The live response stream, which the caller must close, or null if the request
    /// could not be completed.
    /// </returns>
    public static Stream GetStream(string url, CookieContainer cookieContainer)
    {
        if (string.IsNullOrEmpty(url))
        {
            return null;
        }

        long attempts = AttemptCount();
        for (long attempt = 0; attempt < attempts; attempt++)
        {
            // The first attempt is immediate; only retries are spaced out.
            if (attempt > 0)
            {
                Thread.Sleep(NetworkDelay);
            }

            HttpWebResponse response = null;
            try
            {
                response = Send(url, null, cookieContainer);
                return response.GetResponseStream();
            }
            catch (Exception ex)
            {
                if (response != null)
                {
                    response.Close();
                }
                if (!IsTransient(ex))
                {
                    return null;
                }
            }
        }
        return null;
    }
    #endregion

    #region Strip HTML markup

    /// <summary>
    /// Removes script blocks and tags from an HTML fragment and decodes a small set of
    /// character entities, returning the remaining text.
    /// </summary>
    /// <param name="Htmlstring">HTML source text; null is treated as empty.</param>
    /// <returns>
    /// The text without markup. Line breaks that are followed by whitespace are removed
    /// together with that whitespace. Numeric entities other than the handled ones are dropped.
    /// </returns>
    public static string NoHTML(string Htmlstring)
    {
        if (string.IsNullOrEmpty(Htmlstring))
        {
            return string.Empty;
        }

        // Singleline lets '.' cross line breaks, so multi-line scripts are removed
        // including their body instead of leaving the JavaScript text behind.
        Htmlstring = Regex.Replace(Htmlstring, @"<script[^>]*?>.*?</script>", "", RegexOptions.IgnoreCase | RegexOptions.Singleline);

        // Tags, then anything tag-like that spans lines, then comment leftovers.
        Htmlstring = Regex.Replace(Htmlstring, "<.+?>", "", RegexOptions.IgnoreCase);
        Htmlstring = Regex.Replace(Htmlstring, @"<(.[^>]*)>", "", RegexOptions.IgnoreCase);
        Htmlstring = Regex.Replace(Htmlstring, @"([\r\n])[\s]+", "", RegexOptions.IgnoreCase);
        Htmlstring = Regex.Replace(Htmlstring, @"-->", "", RegexOptions.IgnoreCase);
        Htmlstring = Regex.Replace(Htmlstring, @"<!--.*", "", RegexOptions.IgnoreCase);

        // Entities are decoded in a single pass so the output of one replacement is
        // never decoded again ("&amp;lt;" must become "&lt;", not "<").
        // The '<' and '>' produced here are text, not markup, and are kept.
        Htmlstring = htmlEntity.Replace(Htmlstring, new MatchEvaluator(DecodeEntity));

        return Htmlstring;
    }

    /// <summary>
    /// Maps one matched entity to its replacement text; unhandled numeric entities
    /// are replaced by an empty string.
    /// </summary>
    private static string DecodeEntity(Match match)
    {
        switch (match.Groups[1].Value.ToLowerInvariant())
        {
            case "quot":
            case "#34":
                return "\"";
            case "amp":
            case "#38":
                return "&";
            case "lt":
            case "#60":
                return "<";
            case "gt":
            case "#62":
                return ">";
            case "nbsp":
            case "#160":
                return "   ";
            case "iexcl":
            case "#161":
                return "\u00A1";
            case "cent":
            case "#162":
                return "\u00A2";
            case "pound":
            case "#163":
                return "\u00A3";
            case "copy":
            case "#169":
                return "\u00A9";
            default:
                return string.Empty;
        }
    }
    #endregion

    #region Attribute helpers

    /// <summary>
    /// Returns the attribute value captured by one of the three quoting alternatives.
    /// </summary>
    private static string AttributeValue(Match match)
    {
        for (int group = 1; group <= 3; group++)
        {
            if (match.Groups[group].Success)
            {
                return match.Groups[group].Value.Trim();
            }
        }
        return string.Empty;
    }
    #endregion

    #region Extract page links

    /// <summary>
    /// Collects the values of all <c>href</c> attributes in the given HTML.
    /// </summary>
    /// <param name="HtmlCode">HTML source text; null is treated as empty.</param>
    /// <returns>
    /// The link targets in document order, each followed by '|'. Character case is
    /// preserved and surrounding quotes are removed. Values that are empty or do not
    /// start with a word character, slash, backslash, dot, colon or hyphen (for
    /// example "#top") are skipped.
    /// </returns>
    public string GetHref(string HtmlCode)
    {
        if (string.IsNullOrEmpty(HtmlCode))
        {
            return string.Empty;
        }

        StringBuilder links = new StringBuilder();
        foreach (Match match in hrefAttribute.Matches(HtmlCode))
        {
            string target = AttributeValue(match);
            if (target.Length == 0 || !Regex.IsMatch(target, @"^[\w\\/.:\-]"))
            {
                continue;
            }
            links.Append(target).Append('|');
        }
        return links.ToString();
    }
    #endregion

    #region Extract image addresses

    /// <summary>
    /// Collects the image address of every <c>&lt;img&gt;</c> tag in the given HTML.
    /// </summary>
    /// <param name="HtmlCode">HTML source text; null is treated as empty.</param>
    /// <param name="imgHttp">Prefix (for example "http://host/") prepended to relative addresses.</param>
    /// <returns>The addresses in document order, each followed by '|'.</returns>
    public string GetImgSrc(string HtmlCode, string imgHttp)
    {
        if (string.IsNullOrEmpty(HtmlCode))
        {
            return string.Empty;
        }

        StringBuilder addresses = new StringBuilder();
        // Match case-insensitively instead of lower-casing the document, which
        // would corrupt case-sensitive paths.
        foreach (Match tag in Regex.Matches(HtmlCode, @"<img.+?>", RegexOptions.IgnoreCase))
        {
            addresses.Append(GetImg(tag.Value.Trim(), imgHttp)).Append('|');
        }
        return addresses.ToString();
    }

    /// <summary>
    /// Extracts the <c>src</c> value from a single <c>&lt;img&gt;</c> tag.
    /// </summary>
    /// <param name="ImgString">Text of one img tag.</param>
    /// <param name="imgHttp">Prefix prepended when the address is relative.</param>
    /// <returns>
    /// The address unchanged when it has a scheme, is protocol-relative, or contains
    /// one of the known domain suffixes; otherwise <paramref name="imgHttp"/> followed
    /// by the address. When no <c>src</c> is found the result is just <paramref name="imgHttp"/>.
    /// </returns>
    public string GetImg(string ImgString, string imgHttp)
    {
        string address = string.Empty;
        if (!string.IsNullOrEmpty(ImgString))
        {
            Match match = srcAttribute.Match(ImgString);
            if (match.Success)
            {
                address = AttributeValue(match);
            }
        }

        if (absoluteUrl.IsMatch(address) || ContainsKnownDomainSuffix(address))
        {
            return address;
        }
        return imgHttp + address;
    }

    /// <summary>
    /// True when the address contains one of the domain suffixes treated as a host name marker.
    /// </summary>
    private static bool ContainsKnownDomainSuffix(string address)
    {
        foreach (string suffix in knownDomainSuffixes)
        {
            if (address.IndexOf(suffix, StringComparison.OrdinalIgnoreCase) != -1)
            {
                return true;
            }
        }
        return false;
    }
    #endregion

    #region Fetch remote page content

    /// <summary>
    /// Fetches a remote page with a GET request.
    /// </summary>
    /// <param name="tUrl">Address to request.</param>
    /// <returns>
    /// The response text decoded with the system default encoding, every line
    /// terminated by "\r\n". If any exception occurs, the exception message is
    /// returned instead of page content.
    /// </returns>
    public static string Get_Http(string tUrl)
    {
        try
        {
            HttpWebRequest request = (HttpWebRequest)WebRequest.Create(tUrl);
            request.Timeout = 19600;
            using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
            using (Stream responseStream = response.GetResponseStream())
            using (StreamReader reader = new StreamReader(responseStream, Encoding.Default))
            {
                StringBuilder content = new StringBuilder();
                // ReadLine returns null only at the real end of the stream; Peek() may
                // report -1 earlier on a network stream and truncate the page.
                string line;
                while ((line = reader.ReadLine()) != null)
                {
                    content.Append(line).Append("\r\n");
                }
                return content.ToString();
            }
        }
        catch (Exception ex)
        {
            return ex.Message;
        }
    }

    /// <summary>
    /// Fetches a remote page with a form-encoded POST request.
    /// </summary>
    /// <param name="url">Address to request.</param>
    /// <param name="postData">Form data string; null is sent as an empty body.</param>
    /// <param name="encodeType">Name of the encoding used for the request body, e.g. "utf-8".</param>
    /// <returns>
    /// The response text decoded with the system default encoding. If any exception
    /// occurs, the exception message is returned instead of page content.
    /// </returns>
    public static string Post_Http(string url, string postData, string encodeType)
    {
        try
        {
            Encoding bodyEncoding = Encoding.GetEncoding(encodeType);
            byte[] body = bodyEncoding.GetBytes(postData ?? string.Empty);

            HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
            request.Method = "POST";
            request.ContentType = "application/x-www-form-urlencoded";
            request.ContentLength = body.Length;
            using (Stream requestStream = request.GetRequestStream())
            {
                requestStream.Write(body, 0, body.Length);
            }

            // NOTE(review): the response is decoded with Encoding.Default rather than
            // encodeType, as before - confirm whether that is intended.
            using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
            using (StreamReader reader = new StreamReader(response.GetResponseStream(), Encoding.Default))
            {
                return reader.ReadToEnd();
            }
        }
        catch (Exception ex)
        {
            return ex.Message;
        }
    }
    #endregion

    #region Compress HTML output

    /// <summary>
    /// Shrinks HTML by removing whitespace between tags and line breaks with the
    /// whitespace that follows them.
    /// </summary>
    /// <param name="Html">HTML source text; null is treated as empty.</param>
    /// <returns>The compacted HTML.</returns>
    public static string ZipHtml(string Html)
    {
        if (string.IsNullOrEmpty(Html))
        {
            return string.Empty;
        }

        Html = Regex.Replace(Html, @">\s+?<", "><");
        Html = Regex.Replace(Html, @"\r\n\s*", "");
        // Rewrites the body element with a lower-case tag name; its content is kept as-is.
        Html = Regex.Replace(Html, @"<body([\s|\S]*?)>([\s|\S]*?)</body>", @"<body$1>$2</body>", RegexOptions.IgnoreCase);
        return Html;
    }
    #endregion

    #region Remove a given HTML tag

    /// <summary>
    /// Removes the opening and closing tags of one element type while keeping the
    /// content between them.
    /// </summary>
    /// <param name="s_TextStr">Text to filter.</param>
    /// <param name="html_Str">
    /// Tag name such as "a", "img", "p" or "div". The value is inserted into a regular
    /// expression unescaped, so an alternation like "a|p" is also accepted.
    /// </param>
    /// <returns>The filtered text, or an empty string when <paramref name="s_TextStr"/> is null or empty.</returns>
    public static string DelHtml(string s_TextStr, string html_Str)
    {
        if (string.IsNullOrEmpty(s_TextStr))
        {
            return "";
        }

        // \b stops "a" from also matching <abbr>, <address>, ...; the group keeps an
        // alternation in html_Str scoped to the tag name.
        string tagName = "(?:" + html_Str + ")";
        string result = Regex.Replace(s_TextStr, "<" + tagName + @"\b[^>]*>", "", RegexOptions.IgnoreCase);
        result = Regex.Replace(result, "</" + tagName + @"\s*>", "", RegexOptions.IgnoreCase);
        return result;
    }
    #endregion

    #region Resolve a file path

    /// <summary>
    /// Resolves an application-relative path to a URL usable by the browser.
    /// </summary>
    /// <param name="Path">Path passed to <c>ResolveUrl</c>.</param>
    /// <param name="p">Page used to resolve the path.</param>
    public static string File(string Path, System.Web.UI.Page p)
    {
        return p.ResolveUrl(Path);
    }
    #endregion

    #region Build a CSS link tag

    /// <summary>
    /// Builds a stylesheet <c>&lt;link&gt;</c> tag for the given path, followed by a line break.
    /// </summary>
    /// <param name="cssPath">Path passed to <c>ResolveUrl</c>.</param>
    /// <param name="p">Page used to resolve the path.</param>
    public static string CSS(string cssPath, System.Web.UI.Page p)
    {
        return @"<link href=""" + p.ResolveUrl(cssPath) + @""" rel=""stylesheet"" type=""text/css"" />" + "\r\n";
    }
    #endregion

    #region Build a JavaScript script tag

    /// <summary>
    /// Builds a <c>&lt;script&gt;</c> tag for the given path, followed by a line break.
    /// </summary>
    /// <param name="jsPath">Path passed to <c>ResolveUrl</c>.</param>
    /// <param name="p">Page used to resolve the path.</param>
    public static string JS(string jsPath, System.Web.UI.Page p)
    {
        return @"<script type=""text/javascript"" src=""" + p.ResolveUrl(jsPath) + @"""></script>" + "\r\n";
    }
    #endregion
}

转载于:https://www.cnblogs.com/mynameltg/p/4043477.html

你可能感兴趣的文章
linux文本模式和文本替换功能
查看>>
Windows SFTP 的安装
查看>>
摄像机与绕任意轴旋转
查看>>
rsync 服务器配置过程
查看>>
预处理、const与sizeof相关面试题
查看>>
爬虫豆瓣top250项目-开发文档
查看>>
Elasticsearch增删改查
查看>>
oracle归档日志增长过快处理方法
查看>>
有趣的数学书籍
查看>>
teamviewer 卸载干净
查看>>
多线程设计模式
查看>>
解读自定义UICollectionViewLayout--感动了我自己
查看>>
SqlServer作业指定目标服务器
查看>>
UnrealEngine4.5 BluePrint初始化中遇到编译警告的解决办法
查看>>
User implements HttpSessionBindingListener
查看>>
抽象工厂方法
查看>>
ubuntu apt-get 安装 lnmp
查看>>
焊盘 往同一个方向增加 固定的长度方法 总结
查看>>
eclipse的maven、Scala环境搭建
查看>>
架构师之路(一)- 什么是软件架构
查看>>