Contents 
 Index 
 "Perl Program Reference" 
 < Previous 
 Next > 

strip_html.pl

Go to the documentation of this file.
00001 // This file has been modified on-the-fly with an input filter
00002 // to change it from Perl syntax to C++ strictly for the purposes
00003 // of faking out Doxygen. Modifications include:
00004 
00005 // - changing local() definitions to C++ #define statements.
00006 // - commenting out undef statements.
00007 // - changing $globe'... variable names to $globe_...
00008 // - changing sub statements to look like C++ functions.
00009 // - changing # comments to C++ comments.
00010 // - ...
00011 
00012 // If you see other strangeness in the HTML version of the Perl file,
00013 // it comes from getting it to look more C++ like.
00014 
00015 
00016 // #!/usr/local/bin/perl
00017 
00018 /////////////////////////////////////////////////////////////////////////////////
00019 /** @file
00020  ** @brief Strip out all HTML except for href's.
00021  ** 
00022  **
00023  ** @param Input source file.
00024  ** @return Output is file with changes.
00025  **
00026  ** @ingroup tp_tools tp_dox
00027  **
00028  ** @author Glenn C. Maxey
00029  **/
00030 // #
00031 //// $Id: strip_html.pl,v 1.1 2002/02/28 00:43:01 gmaxe Exp $
00032 ////
00033 //// 2002 Created by Voyant Technologies, Inc., Westminster, Colorado, USA.
00034 ////
00035 //// Permission to use, copy, modify, and distribute this software and its 
00036 //// documentation under the terms of the GNU General Public License is hereby 
00037 //// granted. No representations are made about the suitability of this software 
00038 //// for any purpose. It is provided "as is" without express or implied warranty. 
00039 //// See the GNU General Public License (http://www.gnu.org/copyleft/gpl.html) 
00040 //// for more details.
00041 //// 
00042 //// Documents produced by this script are derivative works derived from the 
00043 //// input used in their production; they are not affected by this license.
00044 ////
00045 //// $Log: strip_html.pl,v $
00046 //// Revision 1.1  2002/02/28 00:43:01  gmaxe
00047 //// New file for stripping down an HTML file for purposes of a list.
00048 ////
00049 //// Revision 1.1  2002/02/16 00:59:40  gmaxe
00050 //// Added support for script files needed for TOC applet.
00051 ////
00052 ////
00053 /////////////////////////////////////////////////////////////////////////////////
00054 
00055 BEGIN {
00056    //  Buffers for the raw input, the text being stripped and the anchor
00057    //  currently being rebuilt; all of them start out empty.
00058    $in_buffer = $out_buffer = $href = "";
00059    $line_count = 0;
00060    //  Stand-ins for "<" and ">" around the anchors that are kept.
00061    ($separator0, $separator1) = ("\;0\;", "\;1\;");
00062 }
00063 
00064 
00065 //####
00066 // main program
00067 // Reads everything supplied through <> into one buffer, removes every
00068 // HTML tag except opening <a ... href=...> anchors (minus their class=
00069 // and target= parameters), then tidies the whitespace of the result.
00070 // Works on the globals initialised in the BEGIN block.
00071 //####
00072 {
00073    NEW_LINE: while (<>) {
00074       //  Read in entire file into buffer
00075       $in_buffer .= $_;
00076    }
00077    if (0) {
00078 //       print "We think we read it in.\n";
00079 //       print $in_buffer;
00080    }
00081 
00082    $out_buffer = $in_buffer;
00083 
00084    //  Every pass removes the first "<...>" still in the buffer.  Anchors
00085    //  that are kept are wrapped in $separator0/$separator1 rather than
00086    //  "<" and ">" so that later passes do not take them for tags.
00087    while ($out_buffer =~ /\</) {
00088       my ($before, $after) = split (/\</, $out_buffer, 2);
00089       if ($after !~ /\>/) {
00090          //  No ">" is left, so the remaining "<" are plain text.  Protect
00091          //  them with the separator and stop; without this exit the
00092          //  loop would never terminate on an unmatched "<".
00093 //          print "ERROR: Unmatching < and >\n";
00094          $after =~ s/\</$separator0/g;
00095          $out_buffer = "$before$separator0$after";
00096          last;
00097       }
00098       my ($markup, $rest) = split (/\>/, $after, 2);
00099       if (($markup =~ /^a\s/i) && ($markup =~ /href/i)) {
00100          $href = "$separator0$markup$separator1";
00101          $href = &get_rid_of_tag_params ($href, "class\=");
00102          $href = &get_rid_of_tag_params ($href, "target\=");
00103          //  get rid of line breaks from within the hyperlink
00104          $href =~ s/\n//g;
00105          if (0){
00106 //             print "href:$href\n";
00107          }
00108       } else {
00109          $href = "";
00110       }
00111       //  rebuild the buffer without the tag, except for a kept href.
00112       $out_buffer = "$before$href$rest";
00113    } //  while there are HTML tags to remove
00114 
00115    $out_buffer =~ s/$separator0/\</g;
00116    $out_buffer =~ s/$separator1/\>/g;
00117    $out_buffer =~ s/\r//g;
00118    //  Drop trailing whitespace and blank lines, then convert the line
00119    //  ends to CRLF exactly once.  NOTE(review): CRLF output is presumed
00120    //  to be the intent -- confirm.
00121    $out_buffer =~ s/\s*\n/\n/g;
00122    $out_buffer =~ s/\n/\r\n/g;
00123 
00124    if (1) {
00125       //  The true output; conditional helps in troubleshooting.
00126       //  NOTE(review): prints show as comments here, presumably filter output.
00127 //       print $out_buffer;
00128    }
00129 } //  main program
00130 //#######
00131 // get_rid_of_tag_params
00132 //
00133 // Removes the first occurrence of one parameter, together with its
00134 // value, from the text of an HTML tag.
00135 //
00136 // $_[0]  the tag text.
00137 // $_[1]  the parameter name including its "=", e.g. "class="; it is
00138 //        used as a regular expression and matched case-insensitively.
00139 // Returns the tag text without that parameter; the text comes back
00140 // unchanged when the parameter is not present.
00141 //#######
00142 int get_rid_of_tag_params  ( ) {
00143    my ($tag, $param) = @_;
00144 
00145    //  The value may be double-quoted, single-quoted or bare.  A bare
00146    //  value stops at whitespace or ";" so that the caller's ";1;" end
00147    //  marker is left alone.  Splitting on the next two '"' instead ate
00148    //  the following parameter whenever the value was not double-quoted.
00149    $tag =~ s/\s+$param(?:"[^"]*"|'[^']*'|[^\s;"']*)//i;
00150    return ($tag);
00151 } //  get_rid_of_tag_params


 "Perl Program Reference" 
 < Previous 
 Next > 


Open-Source tools compliments of Voyant Technologies, Inc. and Glenn C. Maxey.
01/13/2003

TP Tools v2-00-0a

# tpt-perl-hcr-02