Class String
In: lib/feedparser/text-output.rb
lib/feedparser/textconverters.rb
Parent: Object
String\n[lib/feedparser/text-output.rb\nlib/feedparser/textconverters.rb] TopLevel

This class provides various converters

Methods

Public Instance methods

Returns a true value if the text contains escaped HTML (with HTML entities). Used by String#text2html.

[Source]

    # File lib/feedparser/textconverters.rb, line 13
# Tells whether the text looks like entity-escaped HTML, i.e. markup whose
# angle brackets were written as &lt; / &gt;. Used by String#text2html.
#
# Returns the match index of the first pattern that matches (a true value),
# or nil when none of the patterns match.
def escaped_html?
  # Only the escaped forms count here; literal tags such as "<p>" are not
  # "escaped HTML" and must not trigger un-escaping.
  (self =~ /&lt;img src=/) || (self =~ /&lt;a href=/) ||
    (self =~ /&lt;br(\/| \/|)&gt;/) || (self =~ /&lt;p&gt;/)
end

Convert an HTML text to plain text

[Source]

    # File lib/feedparser/text-output.rb, line 6
 # Convert an HTML text to plain text.
 #
 # A copy of the receiver is fed through FeedParser::HTML2TextParser and the
 # parser's saved data is then tidied up in place and returned.
 def html2text
   parser = FeedParser::HTML2TextParser::new(true)
   parser.feed(self.clone)
   parser.close
   plain = parser.savedata
   # Whitespace clean-up passes, applied in this order.
   [
     [/\A\s*/m, ''],         # leading whitespace
     [/\s*\Z/m, ''],         # trailing whitespace
     [/ *\n/m, "\n"],        # spaces before a newline
     [/\n */m, "\n"],        # spaces after a newline
     [/\n\n+/m, "\n\n"]      # runs of blank lines collapse to one
   ].each do |pattern, replacement|
     plain.gsub!(pattern, replacement)
   end
   plain
 end

Is this text HTML? Searches for common tags. Used by String#text2html.

[Source]

    # File lib/feedparser/textconverters.rb, line 8
 # Heuristic check for HTML markup: looks for a handful of common tags.
 # Used by String#text2html.
 #
 # The patterns are tried in order; the match index of the first pattern
 # that matches is returned (a true value), or nil if none match.
 def html?
   tag_patterns = [/<p>/, /<\/p>/, /<br>/, /<br\s*(\/)?\s*>/, /<\/a>/, /<img.*>/]
   tag_patterns.each do |tag|
     position = (self =~ tag)
     return position if position
   end
   nil
 end

Remove white space around the text

[Source]

    # File lib/feedparser/textconverters.rb, line 51
# Strip leading and trailing whitespace from the receiver, in place.
#
# Both patterns can match the empty string, so each gsub! always finds a
# match and returns the receiver rather than nil; the method therefore
# returns the (modified) receiver.
def rmWhiteSpace!
  gsub!(/\A\s*/m, '')
  gsub!(/\s*\Z/m,'')
end

Convert text to HTML.

[Source]

    # File lib/feedparser/textconverters.rb, line 35
# Convert text to HTML.
#
# Works on a copy of the receiver:
# * text that already looks like HTML (html?) is returned as is;
# * text that looks like escaped HTML (escaped_html?) is returned un-escaped;
# * anything else is wrapped in <p> paragraphs and its http/ftp/https URIs
#   are turned into links.
def text2html
  html = self.clone
  return html if html.html?
  return html.unescape_html if html.escaped_html?

  # Wrap the whole text in one paragraph, then split it on blank lines.
  html.gsub!(/\A\s*(.*)\Z/m, '<p>\1</p>')
  html.gsub!(/\s*\n(\s*\n)+\s*/, "</p>\n<p>")

  # Turn URIs into anchors.
  # NOTE(review): URI::regexp is flagged obsolete by newer Ruby versions;
  # confirm before upgrading.
  uri_pattern = URI::regexp(['http','ftp','https'])
  html.gsub!(/(#{uri_pattern})/,
      '<a href="\1">\1</a>')
  html
end

Convert a text in inputenc to a text in UTF-8. Must take care of wrong input locales.

[Source]

    # File lib/feedparser/textconverters.rb, line 57
# Convert a text declared to be in +inputenc+ to UTF-8, coping with wrong
# encoding declarations.
#
# inputenc:: name of the declared input encoding (compared case-insensitively
#            against 'utf-8').
#
# Returns the receiver itself when it is declared as UTF-8 or when its bytes
# already form valid UTF-8; otherwise returns a new string in which every
# input byte is treated as one code point (i.e. Latin-1 style) and encoded
# as UTF-8.
def toUTF8(inputenc)
  return self if inputenc.downcase == 'utf-8'

  # It is said not to be UTF-8: make sure it REALLY is not.
  begin
    # Compare byte sequences rather than strings: String#== is false for
    # strings carrying non-ASCII bytes under different encoding tags, which
    # would make valid UTF-8 bytes in e.g. a binary-tagged string be
    # re-encoded a second time.
    return self if unpack('U*').pack('U*').unpack('C*') == unpack('C*')
  rescue
    # Malformed UTF-8: fall through to the byte-wise conversion.
  end

  begin
    unpack('C*').pack('U*')
  rescue
    self # failsafe solution, but a dirty one
  end
end

Un-escape HTML in the text. Used by String#text2html.

[Source]

    # File lib/feedparser/textconverters.rb, line 18
# Un-escape HTML entities in the text. Used by String#text2html.
#
# Replaces &lt; &gt; &apos; &quot; &amp; &#39; &#038; and &#38; with the
# characters they stand for. The receiver is modified in place and returned.
def unescape_html
  # Keyed by entity: several entities map to the same character, so a
  # character-keyed hash would silently drop all but one of them.
  entities = {
    '&lt;'   => '<',
    '&gt;'   => '>',
    '&apos;' => "'",
    '&quot;' => '"',
    '&amp;'  => '&',
    '&#39;'  => "'",
    '&#038;' => '&',
    '&#38;'  => '&'
  }
  # Single pass, so that an ampersand produced by one replacement is never
  # re-read as the start of another entity ("&amp;lt;" becomes "&lt;").
  gsub!(Regexp.union(entities.keys)) { |entity| entities[entity] }
  self
end

[Validate]