Search the articles  
  

• Stripping HTML Tags

Friday, August 21, 2009

If you have raw text containing HTML markup, for example from a search engine crawler or some other data source, and you need the plain text without the HTML tags, the following method strips the HTML tags from a string.

/// <summary>
/// Removes HTML markup from a string and returns the remaining plain text.
/// </summary>
/// <remarks>
/// Only tags whose names appear in the built-in tag list are removed; any other
/// <c>&lt;...&gt;</c> sequence (for example <c>a &lt; b</c>) is kept as literal text.
/// Block tags (script, style, head, ...) are removed together with their content,
/// up to and including the matching closing tag, or up to the end of the text when
/// no closing tag exists. HTML comments are removed entirely. Finally, each
/// occurrence of two consecutive CR/LF line breaks is collapsed into one
/// (single pass).
/// </remarks>
/// <param name="strText">Text that may contain HTML markup. May be null or empty.</param>
/// <returns>The text without recognised HTML tags; an empty string for null or empty input.</returns>
public string RemoveHTML(string strText)
{
   // Names of tags that are stripped. Every entry is delimited by ';' on both sides
   // so a whole-name match can be done with a single substring search.
   const string TAGLIST =
         ";!--;!DOCTYPE;A;ACRONYM;ADDRESS;APPLET;AREA;B;BASE;BASEFONT;" +
         "BGSOUND;BIG;BLOCKQUOTE;BODY;BR;BUTTON;CAPTION;CENTER;CITE;CODE;" +
         "COL;COLGROUP;COMMENT;DD;DEL;DFN;DIR;DIV;DL;DT;EM;EMBED;FIELDSET;" +
         "FONT;FORM;FRAME;FRAMESET;HEAD;H1;H2;H3;H4;H5;H6;HR;HTML;I;IFRAME;IMG;" +
         "INPUT;INS;ISINDEX;KBD;LABEL;LAYER;LEGEND;LI;LINK;LISTING;MAP;MARQUEE;" +
         "MENU;META;NOBR;NOFRAMES;NOSCRIPT;OBJECT;OL;OPTION;P;PARAM;PLAINTEXT;" +
         "PRE;Q;S;SAMP;SCRIPT;SELECT;SMALL;SPAN;STRIKE;STRONG;STYLE;SUB;SUP;" +
         "TABLE;TBODY;TD;TEXTAREA;TFOOT;TH;THEAD;TITLE;TR;TT;U;UL;VAR;WBR;XMP;TODAYSYSTEM;";

   // Tags whose content is removed along with the tag itself.
   const string BLOCKTAGLIST = ";APPLET;EMBED;FRAMESET;HEAD;NOFRAMES;NOSCRIPT;OBJECT;SCRIPT;STYLE;";

   if (string.IsNullOrEmpty(strText)) return string.Empty;

   System.Text.StringBuilder sbResult = new System.Text.StringBuilder(strText.Length);

   // Index of the first character that has been neither copied nor discarded yet.
   int nCursor = 0;

   while (nCursor < strText.Length)
   {
      int nOpen = strText.IndexOf('<', nCursor);
      if (nOpen < 0) break;

      // Everything in front of the '<' is plain text.
      sbResult.Append(strText, nCursor, nOpen - nCursor);
      nCursor = nOpen;

      // Comments may contain '>' and markup, so they are skipped as a unit.
      // The search starts at nOpen + 2 so that "<!-->" and "<!--->" count as closed.
      if (string.CompareOrdinal(strText, nOpen, "<!--", 0, 4) == 0)
      {
         int nCommentEnd = strText.IndexOf("-->", nOpen + 2, System.StringComparison.Ordinal);
         nCursor = (nCommentEnd >= 0) ? nCommentEnd + 3 : strText.Length;
         continue;
      }

      int nClose = strText.IndexOf('>', nOpen + 1);
      if (nClose < 0) break; // No '>' left: the remainder is plain text.

      // Isolate the tag name: text up to the first whitespace inside the brackets.
      string strTagName = strText.Substring(nOpen + 1, nClose - nOpen - 1)
                                 .Replace('\r', ' ').Replace('\n', ' ').Replace('\t', ' ');
      int nSpace = strTagName.IndexOf(' ');
      if (nSpace > 0) strTagName = strTagName.Substring(0, nSpace);

      bool bClosingTag = strTagName.Length > 0 && strTagName[0] == '/';
      if (bClosingTag) strTagName = strTagName.Substring(1);

      // Drop the '/' of self-closing tags ("<br/>") and normalise the case
      // independently of the current culture.
      strTagName = strTagName.TrimEnd('/').ToUpperInvariant();

      if (!IsListedTag(TAGLIST, strTagName))
      {
         // Not a known tag: keep the '<' as text and rescan right after it.
         sbResult.Append('<');
         nCursor = nOpen + 1;
         continue;
      }

      int nRemoveEnd = nClose;
      if (!bClosingTag && IsListedTag(BLOCKTAGLIST, strTagName))
      {
         // Remove the block content up to the '>' of the closing tag; when the
         // block is never closed, everything up to the end of the text is removed.
         int nEndTag = strText.IndexOf("</" + strTagName, nClose + 1,
                                       System.StringComparison.OrdinalIgnoreCase);
         int nEndTagClose = (nEndTag >= 0) ? strText.IndexOf('>', nEndTag + 2) : -1;
         nRemoveEnd = (nEndTagClose >= 0) ? nEndTagClose : strText.Length - 1;
      }

      nCursor = nRemoveEnd + 1;
   }

   if (nCursor < strText.Length)
   {
      sbResult.Append(strText, nCursor, strText.Length - nCursor);
   }

   return sbResult.ToString().Replace("\r\n\r\n", "\r\n");
}

// Returns true when strTagName is a whole entry of the ';'-delimited strTagList.
private static bool IsListedTag(string strTagList, string strTagName)
{
   // An empty name, or one containing the delimiter, can never be a list entry.
   if (strTagName.Length == 0 || strTagName.IndexOf(';') >= 0) return false;

   return strTagList.IndexOf(";" + strTagName + ";", System.StringComparison.Ordinal) >= 0;
}

No comments:

Post a Comment