# The following script retrieves just the text description (minus HTML tags) of one item from the Google News RSS feed for Edinburgh.


require 'open-uri'
require 'rexml/document'
require 'nokogiri'
require 'builder'
include REXML

# Fetch the Google News RSS feed for a keyword and reduce one item's
# <description> (an HTML fragment) to plain text. The result is left in `desc`.
keyword = 'edinburgh'
# URI.escape is obsolete and was removed in Ruby 3.0; encode the keyword as a
# query-string component instead.
query = URI.encode_www_form_component(keyword)
url = "http://news.google.com/news?pz=1&cf=all&ned=uk&hl=en&q=#{query}&cf=all&output=rss"

# URI.open replaces open-uri's Kernel#open patch (removed in Ruby 3.0), and the
# block form closes the response as soon as it has been read. The header name
# needs its hyphen: 'UserAgent' is sent as an unrelated custom header.
buffer = URI.open(url, 'User-Agent' => 'S-Rscript', &:read)
feed = Document.new(buffer)
nodes = XPath.match(feed.root, '//item')

# NOTE(review): index 1 selects the second <item> of the feed; switch to 0 if
# the first story was intended.
item = nodes[1]
raise "fewer than two <item> elements in feed: #{url}" if item.nil?

# The description is an HTML fragment. Nokogiri tidies it into well-formed
# markup so that REXML can parse it and run XPath queries over it.
desc = item.text('description').to_s
doc = Nokogiri::HTML(desc)
doc2 = Document.new(doc.xpath('html/body').to_xml)

# Location of the story block inside the description's layout table.
xpath = "table/tr/td[2]/font/div[@class='lh']"
node = XPath.first(doc2.root, xpath)
raise "description markup did not match #{xpath}" if node.nil?

# Drop the first <a> and the first <font> child of the story block so that only
# the remaining content is kept.
a_tag = XPath.first(node, 'a')
font_tag = XPath.first(node, 'font')
node.delete(a_tag) if a_tag
node.delete(font_tag) if font_tag

# Strip the remaining tags and newlines, then keep everything before the last
# "..."; when there is no ellipsis, keep the whole text rather than nil.
text = node.to_s.gsub(/<\/?[^>]*>|\n/, "").strip
desc = text[/.*(?=\.\.\.)/] || text

#=> "Mike Blair is among three Scotland internationals who return to the Edinburgh starting line-up for the Magners League clash with Scarlets on Friday. "

# Read more: http://feeds.dzone.com/~r/dzone/snippets/~3/h6gN32FfGlo/10693