using System;
using System.Text;
using System.Net;
using System.IO;
using System.Threading;
using System.Text.RegularExpressions;

/// <summary>
/// Helpers for downloading pages over HTTP (with a shared cookie container,
/// a randomized delay and bounded retries) and for simple regex-based HTML
/// clean-up, link/image extraction and tag generation.
/// </summary>
public class HTMLHelper
{
    #region Private fields

    private static readonly CookieContainer cc = new CookieContainer();
    private static readonly string contentType = "application/x-www-form-urlencoded";
    private static readonly string accept = "image/gif, image/x-xbitmap, image/jpeg, image/pjpeg, application/x-shockwave-flash, application/x-silverlight, application/vnd.ms-excel, application/vnd.ms-powerpoint, application/msword, application/x-ms-application, application/x-ms-xbap, application/vnd.ms-xpsdocument, application/xaml+xml, application/x-silverlight-2-b1, */*";
    private static readonly string userAgent = "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; .NET CLR 2.0.50727; .NET CLR 3.0.04506.648; .NET CLR 3.5.21022)";
    private static Encoding encoding = Encoding.GetEncoding("utf-8");
    private static int delay = 1000;
    private static int maxTry = 300;

    // One shared generator: creating a new Random per call can yield identical
    // values for calls made close together. Random is not thread-safe, hence the lock.
    private static readonly Random random = new Random();
    private static readonly object randomLock = new object();

    #endregion

    #region Public properties

    /// <summary>
    /// The cookie container shared by this helper.
    /// </summary>
    public static CookieContainer CookieContainer
    {
        get { return cc; }
    }

    /// <summary>
    /// Encoding used to decode response bodies in <c>GetHtml</c>.
    /// </summary>
    public static Encoding Encoding
    {
        get { return encoding; }
        set { encoding = value; }
    }

    /// <summary>
    /// Setting stores the base delay in milliseconds; getting returns a random
    /// value in the range [base, 2 * base) (or exactly 0 when the base is 0).
    /// </summary>
    /// <exception cref="ArgumentOutOfRangeException">The assigned value is negative.</exception>
    public static int NetworkDelay
    {
        get
        {
            int lower = delay;
            // Guard against int overflow when doubling very large delays.
            int upper = lower > int.MaxValue / 2 ? int.MaxValue : lower * 2;
            lock (randomLock)
            {
                return random.Next(lower, upper);
            }
        }
        set
        {
            if (value < 0)
            {
                throw new ArgumentOutOfRangeException("value", "The delay must not be negative.");
            }
            delay = value;
        }
    }

    /// <summary>
    /// Maximum number of retries after a failed request. It is also used as the
    /// connection limit of the request's service point.
    /// </summary>
    public static int MaxTry
    {
        get { return maxTry; }
        set { maxTry = value; }
    }

    #endregion

    #region Get HTML

    /// <summary>
    /// Downloads a page, POSTing <paramref name="postData"/> when requested.
    /// </summary>
    /// <param name="url">Address to request.</param>
    /// <param name="postData">Form data to send in the request body.</param>
    /// <param name="isPost">True to send a POST; false to send a plain GET.</param>
    /// <param name="cookieContainer">Cookie container attached to the request.</param>
    /// <returns>The response body, or an empty string when every attempt failed.</returns>
    public static string GetHtml(string url, string postData, bool isPost, CookieContainer cookieContainer)
    {
        // A GET request cannot carry a body (HttpWebRequest rejects it), so
        // without data or without POST this is just a plain GET.
        if (string.IsNullOrEmpty(postData) || !isPost)
        {
            return GetHtml(url, cookieContainer);
        }

        // NOTE(review): the body is encoded with the system default encoding,
        // not with the configurable Encoding property - confirm this is intended.
        byte[] body = System.Text.Encoding.Default.GetBytes(postData);
        return SendWithRetry(url, body, cookieContainer);
    }

    /// <summary>
    /// Downloads a page with a GET request.
    /// </summary>
    /// <param name="url">Address to request.</param>
    /// <param name="cookieContainer">Cookie container attached to the request.</param>
    /// <returns>The response body, or an empty string when every attempt failed.</returns>
    public static string GetHtml(string url, CookieContainer cookieContainer)
    {
        return SendWithRetry(url, null, cookieContainer);
    }

    /// <summary>
    /// Sends the request (POST when <paramref name="body"/> is not null, GET otherwise),
    /// sleeping <see cref="NetworkDelay"/> before every attempt and retrying transient
    /// failures up to <see cref="MaxTry"/> times.
    /// </summary>
    private static string SendWithRetry(string url, byte[] body, CookieContainer cookieContainer)
    {
        int retries = Math.Max(0, maxTry);
        for (int attempt = 0; attempt <= retries; attempt++)
        {
            Thread.Sleep(NetworkDelay);
            HttpWebRequest request = null;
            try
            {
                request = CreateRequest(url, cookieContainer, body == null ? "GET" : "POST");
                if (body != null)
                {
                    request.ContentLength = body.Length;
                    using (Stream requestStream = request.GetRequestStream())
                    {
                        requestStream.Write(body, 0, body.Length);
                    }
                }

                using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
                using (Stream responseStream = response.GetResponseStream())
                using (StreamReader reader = new StreamReader(responseStream, encoding))
                {
                    return reader.ReadToEnd();
                }
            }
            catch (Exception ex)
            {
                CloseFailedResponse(ex);
                // Retrying cannot fix a malformed URL or a null argument.
                if (!IsTransient(ex))
                {
                    return string.Empty;
                }
            }
            finally
            {
                if (request != null)
                {
                    request.Abort();
                }
            }
        }
        return string.Empty;
    }

    /// <summary>
    /// Creates a request carrying the headers shared by every call of this helper.
    /// </summary>
    private static HttpWebRequest CreateRequest(string url, CookieContainer cookieContainer, string method)
    {
        HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
        request.CookieContainer = cookieContainer;
        request.ContentType = contentType;
        // ConnectionLimit rejects values below 1.
        request.ServicePoint.ConnectionLimit = Math.Max(1, maxTry);
        request.Referer = url;
        request.Accept = accept;
        request.UserAgent = userAgent;
        request.Method = method;
        return request;
    }

    /// <summary>
    /// Tells network-level failures (worth retrying) from everything else.
    /// </summary>
    private static bool IsTransient(Exception ex)
    {
        return ex is WebException || ex is IOException;
    }

    /// <summary>
    /// Closes the response attached to a <see cref="WebException"/>, if any.
    /// </summary>
    private static void CloseFailedResponse(Exception ex)
    {
        WebException webException = ex as WebException;
        if (webException != null && webException.Response != null)
        {
            webException.Response.Close();
        }
    }

    #endregion

    #region Get stream

    /// <summary>
    /// Requests <paramref name="url"/> with GET and returns the raw response stream.
    /// </summary>
    /// <remarks>
    /// Example:
    /// <code>
    /// System.Net.CookieContainer cookie = new System.Net.CookieContainer();
    /// Stream s = HTMLHelper.GetStream("", cookie);
    /// picVerify.Image = Image.FromStream(s);
    /// </code>
    /// The caller owns the returned stream and should close it when done.
    /// </remarks>
    /// <param name="url">Address to request.</param>
    /// <param name="cookieContainer">Cookie container attached to the request.</param>
    /// <returns>The response stream, or null when every attempt failed.</returns>
    public static Stream GetStream(string url, CookieContainer cookieContainer)
    {
        int retries = Math.Max(0, maxTry);
        for (int attempt = 0; attempt <= retries; attempt++)
        {
            // The first attempt is immediate; retries are spaced out.
            if (attempt > 0)
            {
                Thread.Sleep(NetworkDelay);
            }

            HttpWebRequest request = null;
            HttpWebResponse response = null;
            try
            {
                request = CreateRequest(url, cookieContainer, "GET");
                response = (HttpWebResponse)request.GetResponse();
                // Request and response stay open: the caller reads from the stream.
                return response.GetResponseStream();
            }
            catch (Exception ex)
            {
                CloseFailedResponse(ex);
                if (request != null)
                {
                    request.Abort();
                }
                if (response != null)
                {
                    response.Close();
                }
                if (!IsTransient(ex))
                {
                    return null;
                }
            }
        }
        return null;
    }

    #endregion

    #region Strip HTML markup

    /// <summary>
    /// Removes scripts, tags and comments from HTML source, decodes a few
    /// common entities and strips any remaining angle brackets.
    /// </summary>
    /// <param name="Htmlstring">Source text containing HTML.</param>
    /// <returns>The text without markup; an empty string for null or empty input.</returns>
    public static string NoHTML(string Htmlstring)
    {
        if (string.IsNullOrEmpty(Htmlstring))
        {
            return string.Empty;
        }

        // Scripts first; Singleline lets the body of a script span several lines.
        Htmlstring = Regex.Replace(Htmlstring, @"<script[^>]*?>.*?</script>", "", RegexOptions.IgnoreCase | RegexOptions.Singleline);

        // Tags, line breaks followed by whitespace, comment remnants.
        Htmlstring = Regex.Replace(Htmlstring, "<.+?>", "", RegexOptions.IgnoreCase);
        Htmlstring = Regex.Replace(Htmlstring, @"<(.[^>]*)>", "", RegexOptions.IgnoreCase);
        Htmlstring = Regex.Replace(Htmlstring, @"([\r\n])[\s]+", "", RegexOptions.IgnoreCase);
        Htmlstring = Regex.Replace(Htmlstring, @"-->", "", RegexOptions.IgnoreCase);
        Htmlstring = Regex.Replace(Htmlstring, @"<!--.*", "", RegexOptions.IgnoreCase);

        // Entities.
        Htmlstring = Regex.Replace(Htmlstring, @"&(quot|#34);", "\"", RegexOptions.IgnoreCase);
        Htmlstring = Regex.Replace(Htmlstring, @"&(lt|#60);", "<", RegexOptions.IgnoreCase);
        Htmlstring = Regex.Replace(Htmlstring, @"&(gt|#62);", ">", RegexOptions.IgnoreCase);
        Htmlstring = Regex.Replace(Htmlstring, @"&(nbsp|#160);", " ", RegexOptions.IgnoreCase);
        Htmlstring = Regex.Replace(Htmlstring, @"&(iexcl|#161);", "\xa1", RegexOptions.IgnoreCase);
        Htmlstring = Regex.Replace(Htmlstring, @"&(cent|#162);", "\xa2", RegexOptions.IgnoreCase);
        Htmlstring = Regex.Replace(Htmlstring, @"&(pound|#163);", "\xa3", RegexOptions.IgnoreCase);
        Htmlstring = Regex.Replace(Htmlstring, @"&(copy|#169);", "\xa9", RegexOptions.IgnoreCase);
        Htmlstring = Regex.Replace(Htmlstring, @"&#(\d+);", "", RegexOptions.IgnoreCase);
        // The ampersand entity is decoded last so that an escaped entity is
        // decoded once only and never turned into a second entity or a tag.
        Htmlstring = Regex.Replace(Htmlstring, @"&(amp|#38);", "&", RegexOptions.IgnoreCase);

        // Strings are immutable: the results must be assigned, otherwise decoded
        // angle brackets would reach the caller as live markup.
        Htmlstring = Htmlstring.Replace("<", "");
        Htmlstring = Htmlstring.Replace(">", "");
        Htmlstring = Htmlstring.Replace("\r\n", "");
        return Htmlstring;
    }

    #endregion

    #region Match links in a page

    /// <summary>
    /// Collects the href attributes found in <paramref name="HtmlCode"/>.
    /// </summary>
    /// <param name="HtmlCode">HTML source to scan.</param>
    /// <returns>
    /// Every match lower-cased, with the text "href=" removed and trimmed, each
    /// followed by a "|" separator. Quotes and any non-whitespace characters
    /// following the value are part of the match and are kept.
    /// </returns>
    public string GetHref(string HtmlCode)
    {
        StringBuilder links = new StringBuilder();
        string pattern = @"(h|H)(r|R)(e|E)(f|F) *= *('|"")?((\w|\\|\/|\.|:|-|_)+)[\S]*";
        foreach (Match match in Regex.Matches(HtmlCode, pattern))
        {
            links.Append(match.Value.ToLower().Replace("href=", "").Trim());
            links.Append("|");
        }
        return links.ToString();
    }

    #endregion

    #region Match image addresses in a page

    /// <summary>
    /// Collects the image addresses of all img tags in <paramref name="HtmlCode"/>.
    /// </summary>
    /// <param name="HtmlCode">HTML source to scan (it is lower-cased before matching).</param>
    /// <param name="imgHttp">Prefix (http:// path) prepended to addresses that look relative.</param>
    /// <returns>The result of <see cref="GetImg"/> for every img tag, each followed by "|".</returns>
    public string GetImgSrc(string HtmlCode, string imgHttp)
    {
        StringBuilder sources = new StringBuilder();
        string pattern = @"<img.+?>";
        foreach (Match match in Regex.Matches(HtmlCode.ToLower(), pattern))
        {
            sources.Append(GetImg(match.Value.ToLower().Trim(), imgHttp));
            sources.Append("|");
        }
        return sources.ToString();
    }

    /// <summary>
    /// Extracts the image path from a single img tag.
    /// </summary>
    /// <param name="ImgString">The img tag text.</param>
    /// <param name="imgHttp">Prefix prepended when the path contains none of the known domain suffixes.</param>
    /// <returns>
    /// The matched path with "src=" removed; prefixed with <paramref name="imgHttp"/>
    /// unless it contains .net, .com, .org, .cn, .cc, .info, .biz or .tv.
    /// </returns>
    public string GetImg(string ImgString, string imgHttp)
    {
        StringBuilder path = new StringBuilder();
        string pattern = @"src=.+\.(bmp|jpg|gif|png|)";
        // NOTE(review): only the text "src=" is removed, so an opening quote of the
        // attribute value stays in the result; the empty last alternative also lets
        // the match end right after a dot. Kept as-is for compatibility.
        foreach (Match match in Regex.Matches(ImgString.ToLower(), pattern))
        {
            path.Append(match.Value.ToLower().Trim().Replace("src=", ""));
        }

        string result = path.ToString();
        string[] domainSuffixes = { ".net", ".com", ".org", ".cn", ".cc", ".info", ".biz", ".tv" };
        foreach (string suffix in domainSuffixes)
        {
            if (result.IndexOf(suffix) != -1)
            {
                return result;
            }
        }
        return imgHttp + result;
    }

    #endregion

    #region Fetch remote page content

    /// <summary>
    /// Fetches a remote page with GET.
    /// </summary>
    /// <param name="tUrl">Address to request.</param>
    /// <returns>
    /// The response text decoded with the system default encoding, every line
    /// terminated by "\r\n"; on failure, the exception message.
    /// </returns>
    public static string Get_Http(string tUrl)
    {
        try
        {
            HttpWebRequest request = (HttpWebRequest)WebRequest.Create(tUrl);
            request.Timeout = 19600;
            using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
            using (Stream responseStream = response.GetResponseStream())
            using (StreamReader reader = new StreamReader(responseStream, System.Text.Encoding.Default))
            {
                StringBuilder text = new StringBuilder();
                string line;
                // ReadLine returns null only at the real end of the stream; Peek
                // may report -1 earlier on network streams and truncate the page.
                while ((line = reader.ReadLine()) != null)
                {
                    text.Append(line).Append("\r\n");
                }
                return text.ToString();
            }
        }
        catch (Exception ex)
        {
            return ex.Message;
        }
    }

    /// <summary>
    /// Fetches a remote page with POST.
    /// </summary>
    /// <param name="url">Address to request.</param>
    /// <param name="postData">Form data to send in the request body.</param>
    /// <param name="encodeType">Name of the encoding used for the request body.</param>
    /// <returns>
    /// The response text decoded with the system default encoding; on failure,
    /// the exception message.
    /// </returns>
    public static string Post_Http(string url, string postData, string encodeType)
    {
        try
        {
            Encoding requestEncoding = System.Text.Encoding.GetEncoding(encodeType);
            byte[] payload = requestEncoding.GetBytes(postData);

            HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
            request.Method = "POST";
            request.ContentType = "application/x-www-form-urlencoded";
            request.ContentLength = payload.Length;
            using (Stream requestStream = request.GetRequestStream())
            {
                requestStream.Write(payload, 0, payload.Length);
            }

            // NOTE(review): the response is decoded with the system default
            // encoding rather than encodeType - confirm this is intended.
            using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
            using (StreamReader reader = new StreamReader(response.GetResponseStream(), System.Text.Encoding.Default))
            {
                return reader.ReadToEnd();
            }
        }
        catch (Exception ex)
        {
            return ex.Message;
        }
    }

    #endregion

    #region Compress HTML output

    /// <summary>
    /// Compresses HTML by removing whitespace between tags and line breaks
    /// together with the whitespace that follows them.
    /// </summary>
    /// <param name="Html">HTML source.</param>
    /// <returns>The compressed HTML; null or empty input is returned unchanged.</returns>
    public static string ZipHtml(string Html)
    {
        if (string.IsNullOrEmpty(Html))
        {
            return Html;
        }

        // Whitespace between a closing and an opening angle bracket.
        Html = Regex.Replace(Html, @">\s+?<", "><");
        Html = Regex.Replace(Html, @"\r\n\s*", "");
        // Rewrites the body element with lower-case tag names; content is unchanged.
        Html = Regex.Replace(Html, @"<body([\s|\S]*?)>([\s|\S]*?)</body>", @"<body$1>$2</body>", RegexOptions.IgnoreCase);
        return Html;
    }

    #endregion

    #region Filter a given HTML tag

    /// <summary>
    /// Removes the opening and closing tags named by <paramref name="html_Str"/>
    /// (the content between them is kept).
    /// </summary>
    /// <param name="s_TextStr">Text to filter.</param>
    /// <param name="html_Str">
    /// Tag name such as a, img, p or div. The value is inserted into a regular
    /// expression unescaped, as before. Null or empty removes every tag.
    /// </param>
    /// <returns>The filtered text; an empty string for null or empty input.</returns>
    public static string DelHtml(string s_TextStr, string html_Str)
    {
        if (string.IsNullOrEmpty(s_TextStr))
        {
            return "";
        }

        string openTag;
        string closeTag;
        if (string.IsNullOrEmpty(html_Str))
        {
            // Same patterns the plain concatenation produced for an empty name.
            openTag = "<[^>]*>";
            closeTag = "</>";
        }
        else
        {
            // The name must be followed by whitespace, "/" or ">" so that tag "a"
            // does not also match tags that merely start with "a" (abbr, article).
            openTag = "<(?:" + html_Str + @")(?=[\s/>])[^>]*>";
            closeTag = "</(?:" + html_Str + @")\s*>";
        }

        string result = Regex.Replace(s_TextStr, openTag, "", RegexOptions.IgnoreCase);
        return Regex.Replace(result, closeTag, "", RegexOptions.IgnoreCase);
    }

    #endregion

    #region Resolve a file path

    /// <summary>
    /// Resolves <paramref name="Path"/> to a URL usable by the client of page <paramref name="p"/>.
    /// </summary>
    /// <param name="Path">Path to resolve.</param>
    /// <param name="p">Page whose ResolveUrl is used.</param>
    public static string File(string Path, System.Web.UI.Page p)
    {
        return p.ResolveUrl(Path);
    }

    #endregion

    #region Load a CSS style sheet

    /// <summary>
    /// Builds a link tag that loads the style sheet at <paramref name="cssPath"/>.
    /// </summary>
    /// <param name="cssPath">Path of the style sheet.</param>
    /// <param name="p">Page whose ResolveUrl is used.</param>
    /// <returns>The link tag followed by "\r\n".</returns>
    public static string CSS(string cssPath, System.Web.UI.Page p)
    {
        string href = p.ResolveUrl(cssPath);
        return @"<link href=""" + href + @""" rel=""stylesheet"" type=""text/css"" />" + "\r\n";
    }

    #endregion

    #region Load a JavaScript file

    /// <summary>
    /// Builds a script tag that loads the JavaScript file at <paramref name="jsPath"/>.
    /// </summary>
    /// <param name="jsPath">Path of the script file.</param>
    /// <param name="p">Page whose ResolveUrl is used.</param>
    /// <returns>The script tag followed by "\r\n".</returns>
    public static string JS(string jsPath, System.Web.UI.Page p)
    {
        string src = p.ResolveUrl(jsPath);
        return @"<script type=""text/javascript"" src=""" + src + @"""></script>" + "\r\n";
    }

    #endregion
}