提取HTML代码中文字的C#函数

    技术2022-05-11  8

    // C# function that extracts the visible text from HTML source.
    using System;
    using System.Text.RegularExpressions;

    public class StripHTMLTest
    {
        // Regular-expression patterns, applied in order. Each entry is paired
        // with the replacement at the same index in Replacements.
        private static readonly string[] Patterns =
        {
            // Script elements that open and close on the same line.
            @"<script[^>]*?>.*?</script>",
            // Any opening/closing tag, with optional namespace prefix and
            // quoted or bare attribute values. \7 refers back to the quote
            // character captured for the current attribute value.
            @"<(\/\s*)?!?((\w+:)?\w+)(\w+(\s*=?\s*(([""'])(\\[""'tbnr]|[^\7])*?\7|\w+)|.{0})|\s)*?(\/\s*)?>",
            // A line break followed by further whitespace.
            @"([\r\n])[\s]+",
            @"&(quot|#34);",
            @"&(amp|#38);",
            @"&(lt|#60);",
            @"&(gt|#62);",
            @"&(nbsp|#160);",
            @"&(iexcl|#161);",
            @"&(cent|#162);",
            @"&(pound|#163);",
            @"&(copy|#169);",
            // Any remaining numeric character reference is dropped.
            @"&#(\d+);",
            // Comment terminator becomes a line break so that the next rule,
            // which removes from "<!--" to the end of the line, stops there.
            @"-->",
            @"<!--.*\n"
        };

        // Replacement text for the pattern at the same index in Patterns.
        private static readonly string[] Replacements =
        {
            "",
            "",
            "",
            "\"",
            "&",
            "<",
            ">",
            " ",
            "\u00A1", // chr(161)
            "\u00A2", // chr(162)
            "\u00A3", // chr(163)
            "\u00A9", // chr(169)
            "",
            "\r\n",
            ""
        };

        // The patterns never change, so compile them once instead of on
        // every call to StripHTML.
        private static readonly Regex[] Rules = CompileRules();

        // Builds one case-insensitive Regex per entry in Patterns.
        private static Regex[] CompileRules()
        {
            Regex[] rules = new Regex[Patterns.Length];
            for (int i = 0; i < Patterns.Length; i++)
            {
                rules[i] = new Regex(Patterns[i], RegexOptions.IgnoreCase);
            }
            return rules;
        }

        /// <summary>
        /// Demo entry point: strips a small sample document and prints the result.
        /// </summary>
        public static void Main()
        {
            string s = StripHTML("<HTML><HEAD><TITLE>中国石龙信息平台</TITLE></HEAD><BODY>faddfs龙信息平台</BODY></HTML>");
            Console.WriteLine(s);
        }

        /// <summary>
        /// Removes HTML markup from a string and decodes a fixed set of character entities.
        /// </summary>
        /// <param name="strHtml">Source text that contains HTML.</param>
        /// <returns>
        /// The text with markup removed; an empty string when
        /// <paramref name="strHtml"/> is null or empty.
        /// </returns>
        public static string StripHTML(string strHtml)
        {
            if (string.IsNullOrEmpty(strHtml))
            {
                return string.Empty;
            }

            // NOTE: the rules run sequentially and the "amp" rule precedes the
            // other named entities, so a double-escaped entity such as
            // "&amp;lt;" is decoded twice.
            string strOutput = strHtml;
            for (int i = 0; i < Rules.Length; i++)
            {
                strOutput = Rules[i].Replace(strOutput, Replacements[i]);
            }

            // String.Replace returns a new string; the result must be assigned
            // or the call has no effect. These passes remove any angle brackets
            // left in the text (including ones decoded from entities above) and
            // the CR+LF pairs inserted by the "-->" rule.
            strOutput = strOutput.Replace("<", "");
            strOutput = strOutput.Replace(">", "");
            strOutput = strOutput.Replace("\r\n", "");
            return strOutput;
        }
    }


    最新回复(0)