dobtco · eabh · Mar 22, 2014 · Mar 22, 2014 · Mar 24, 2014 · Mar 24, 2014
diff --git a/scrapers/fl/config.yml b/scrapers/fl/config.yml
@@ -0,0 +1,2 @@
+name: 'Florida'
+index_url: 'http://www.myflorida.com/apps/vbs/vbs_www.search_r1.matching_ads_page'
diff --git a/scrapers/fl/rfps.coffee b/scrapers/fl/rfps.coffee
@@ -0,0 +1,184 @@
+###
+Schema items not provided are marked x below:
+	id 	A unique identifier string
+ 	type 	The type of posting, e.g. RFP or RFI. 
+	html_url 	A link to the RFP page
+	title 	Title
+ x	department_name 	Department name
+ 	address 	Full address related to this RFP (will be normalized later)
+ 	awarded 	Boolean - has the RFP been awarded? (Leave blank for unknown)
+ x	canceled 	Boolean - has the RFP been canceled? (Leave blank for unknown)
+ 	contact_name 	Contact name
+ 	contact_phone 	Contact phone
+ 	contact_fax 	Contact fax
+ 	contact_email 	Contact email
+ 	created_at 	When was this RFP posted?
+ 	updated_at 	When was this RFP revised?
+ x	responses_open_at 	When do responses open?
+ 	responses_due_at 	When are responses due?
+ 	description 	Text/HTML description
+ x	prebid_conferences 	Array of Conference objects
+ 	downloads 	Array of file URLs
+ 	nigp_codes 	Array of NIGP codes
+ x	commodity 	String representing the commodity (we'll try to match it to a code)
+ x	estimate 	Estimated cost of the contract
+ x	duration 	Duration of contract
+###
+# Require the necessary modules.
+request = require 'request'
+cheerio = require 'cheerio'
+async = require 'async'
+_ = require 'underscore'
+require 'colors'
+
+# Set up some constants that we'll use later.
+TYPE_TYPES = # keys: "Ad Type" text (list column or detail screen) with whitespace and '/' replaced by '_'
+  Competitive_Solicitation: 'ITB'           # Invitation to bid
+  Invitation_to_Negotiate: 'RFP'            # Request for proposal
+  Request_for_Proposal: 'RFP'               # Request for proposal
+  Invitation_to_Bid: 'ITB'                  # Invitation to bid
+  Agency_Decisions: 'XXX'                   # Closed proposals ('XXX' is later turned into awarded = true with type 'RFP')
+  Single_Source: 'XXX'                      # Closed proposals
+#  Some_string_maybe: 'RFQ'                  # Request for quotes (placeholder: no matching "Ad Type" is mapped yet)
+  Informational_Notice: 'RFP'               # Request for proposal
+  Request_for_Information: 'RFI'            # Request for information
+  Public_Meeting_Notice: 'RFP'              # Request for proposal
+  Single_Source_Announcements_Awards: 'XXX' # Closed proposals
+
+MISC_PARAMS =    # item field -> label searched in the detail page text. Sequence here must match sequence in html!!!
+#  id2:              'Advertisement Number:' # Redundant already got from list
+  version_nr:       'Version Number:'
+  created_at:       'Advertisement Begin Date/Time:'
+  responses_due_at: 'Advertisement End Date/Time:'
+  updated_at:       'Last Edit:'
+  contact_name:     'Please direct all questions to:'
+  contact_phone:    'Phone:'   # the unlabelled contact address follows the phone &/or fax number
+  contact_fax:      'FAX:'  # when present, the address is read from the text captured after this label
+  contact_email:    'Email:'
+
+EDIT_LENGTHS = # max characters kept per field; applied as the last step of detail parsing
+#  id2:              30
+  version_nr:        3
+  created_at:       45
+  responses_due_at: 45
+  updated_at:       45
+#  contact_name:     45   (not truncated by length; cut at its first line break instead)
+  contact_phone:    14  # the raw capture also holds the address lines, extracted before truncation
+  contact_fax:      14  # same as contact_phone
+  contact_email:    50
+
+BASE_URL = 'http://www.myflorida.com' # prefix for listing, detail and download links
+WANT_URL = '/apps/vbs/vbs_www.search_r1.matching_ads_page' # listing page path, appended to BASE_URL
+DOWNLOAD_URL = "/apps/vbs/vbs_pdf.download_file?p_file=" # href prefix identifying a downloadable file link
+ASYNC_RQ_MAX = 5 # maximum number of concurrent detail-page requests
+
+# We'll export one function, that takes two parameters: an options hash
+# (`opts.limit` caps the listing rows scraped; it is set to 9999 when falsy)
+# and a callback that is executed once with the array of scraped RFPs.
+module.exports = (opts, done) ->
+
+  # Set up an empty array for our RFPs.
+  rfps = []
+
+  opts.limit = 9999 unless opts.limit
+
+  # A function for scraping the details from an RFP page. A failed request is
+  # logged and skipped (cb gets no error) so one bad page cannot abort the run.
+  getRfpDetails = (item, cb) ->
+    item.address = []
+    item.downloads = []
+    item.nigp_codes = []
+
+    # All closed ad types share the 'XXX' marker, so this matches any of them.
+    if item.type is TYPE_TYPES.Agency_Decisions
+      item.awarded = true
+      item.type = TYPE_TYPES.Request_for_Proposal
+
+    # GET request for the RFP details: load the html response into Cheerio
+    request.get item.html_url, (err, response, body) ->
+      if err or not body
+        console.log "Could not fetch #{item.html_url}: #{err or 'empty body'}".red
+        return cb()
+
+      $ = cheerio.load body
+      $tr = $('body').children('table').eq(0).children('tr').eq(1)
+      $b = $tr.find('b')
+      text = $tr.text()
+
+      # Most data items are in freely formatted text, so we search for their
+      # labels in page order and keep up to 300 characters after each one.
+      # A missing label yields '' without moving the search position.
+      offset = {}
+      loc = 0
+      for k, v of MISC_PARAMS
+        found = text.indexOf v, loc
+        if found < 0
+          item[k] = ''
+          continue
+        loc = found + v.length + 1
+        item[k] = text.substring(loc, loc + 300).trim()
+        offset[k] = loc
+
+      # Contact address has no label, but follows the phone &/or fax number(s)
+      address = if item.contact_fax isnt '' then item.contact_fax else item.contact_phone
+      for line in address.split('Email')[0].substring(20).split '\n'
+        item.address.push line.trim() if line.trim().length > 0
+      item.contact_name = item.contact_name.split('\n')[0].trim()
+      item.contact_email = item.contact_email.split('\n')[0].trim()
+      item.agency = $b.eq(4).text().trim()
+
+      # Type text is more granular on the detail page so recompute it, but keep
+      # the listing's type when the detail label is not a TYPE_TYPES key.
+      detailType = TYPE_TYPES[$b.eq(5).text().trim().replace /[\s\/]/g, '_']
+      item.type = detailType if detailType?
+      if item.type is TYPE_TYPES.Agency_Decisions
+        item.awarded = true
+        item.type = TYPE_TYPES.Request_for_Proposal
+
+      last_nigp_code = ''
+      $tr.find('table').eq(1).find('tr[valign="top"]').each (i, el) ->
+        last_nigp_code = $(@).find('td').eq(0).text().trim()
+        item.nigp_codes.push last_nigp_code.replace /[\s+\-]/g, ''
+
+      $files = $tr.find('table:contains(Downloadable Files for Advertisement)').find('tbody')
+      $files.find('a').each (i, el) ->
+        href = $(@).attr('href')
+        if href? and href.substring(0, DOWNLOAD_URL.length) is DOWNLOAD_URL
+          item.downloads.push BASE_URL + href
+
+      # Description sits after the last NIGP code & before the contact details;
+      # when that code cannot be located the whole span between labels is used.
+      section = text.substring(offset.updated_at ? 0, offset.contact_name ? text.length)
+      pieces = if last_nigp_code then section.split(last_nigp_code) else [section]
+      afterCodes = if pieces.length > 1 then pieces[1] else section
+      description = afterCodes.split('Please direct all questions to:')[0].trim()
+      item.description = description.replace /\s+/g, ' ' # Adjust this if desired to retain line breaks
+
+      for k, v of EDIT_LENGTHS
+        item[k] = item[k].substring(0, v).trim()
+
+      cb()
+
+  request.get "#{BASE_URL}#{WANT_URL}", (err, response, html) ->
+    if err or not html
+      console.log "Could not fetch the listing page: #{err or 'empty body'}".red
+      return done rfps
+
+    # Load the resulting HTML into Cheerio
+    $ = cheerio.load html
+    $("#OutTable").children('tr').slice(1, opts.limit + 1).each (i, el) ->
+      $td = $(@).find('td')
+      href = $(@).find('a').attr('href')
+      return unless href? # no link means no detail page to scrape
+      rfps.push
+        id: $td.eq(1).text().trim()
+        type: TYPE_TYPES[$td.eq(3).text().trim().replace /[\s]/g, '_']
+        title: $td.eq(0).text().trim()
+        version: $td.eq(2).text().trim()
+        html_url: "#{BASE_URL}#{href.trim()}"
+        end_date: $td.eq(4).text().trim()
+
+    # Make up to ASYNC_RQ_MAX concurrent detail requests, then call done().
+    async.eachLimit rfps, ASYNC_RQ_MAX, getRfpDetails, (err) ->
+      console.log(String(err).red) if err
+      done rfps
diff --git a/scrapers/nh/rfps.coffee b/scrapers/nh/rfps.coffee
@@ -1,3 +1,30 @@
+###
+Schema items not provided are marked x below:
+	id 	A unique identifier string
+ 	type 	The type of posting, e.g. RFP or RFI. 
+	html_url 	A link to the RFP page
+	title 	Title
+ 	department_name 	Department name
+ x	address 	Full address related to this RFP (will be normalized later)
+ 	awarded 	Boolean - has the RFP been awarded? (Leave blank for unknown)
+ x	canceled 	Boolean - has the RFP been canceled? (Leave blank for unknown)
+ 	contact_name 	Contact name
+ x	contact_phone 	Contact phone
+ x	contact_fax 	Contact fax
+ 	contact_email 	Contact email
+ 	created_at 	When was this RFP posted?
+ x	updated_at 	When was this RFP revised?
+ x	responses_open_at 	When do responses open?
+ 	responses_due_at 	When are responses due?
+ x	description 	Text/HTML description
+ x	prebid_conferences 	Array of Conference objects
+ 	downloads 	Array of file URLs
+ x	nigp_codes 	Array of NIGP codes
+ 	commodity 	String representing the commodity (we'll try to match it to a code)
+ x	estimate 	Estimated cost of the contract
+ x	duration 	Duration of contract
+###
+
 # Require the necessary modules.
 request = require 'request'
 cheerio = require 'cheerio'
@@ -6,27 +33,6 @@ _ = require 'underscore'
 require 'colors'
 
 # Set up some constants that we'll use later.
-FILTER_PARAMS =
-  track: ''
-  bidResponse: 'all'
-  theType: 'OPEN'
-  govType: 'state'
-  theAgency: 'all'
-  theWord: ''
-  theSort: 'BID NUMBER'
-
-# Schema items not provided:
-#  address            Full address related to this RFP (will be normalized later)
-#  canceled           Boolean - has the RFP been canceled? (Leave blank for unknown)
-#  contact_phone      Contact phone
-#  contact_fax        Contact fax
-#  updated_at         When was this RFP revised?
-#  responses_open_at  When do responses open?
-#  description 	      Text/HTML description
-#  prebid_conferences Array of Conference objects
-#  nigp_codes         Array of NIGP codes
-#  estimate           Estimated cost of the contract
-#  duration           Duration of contract
 
 # Could source any of the following from detail by uncommenting the relevant line
 # Marked with ** if conceptually available from detail screen but not from list
@@ -78,14 +84,17 @@ module.exports = (opts, done) ->
   # Set up an empty array for our RFPs.
   rfps = [];
 
+  unless opts.limit
+    opts.limit = 9999
+
   wanturl = BASE_URL+WANT_URL
 
   # Send a GET request to the site's endpoint
   request.get(wanturl, (err, response, html) ->
 
     # Load the resulting HTML into Cheerio
     $ = cheerio.load html
-    $('body').find('table').eq(3).find('tr').each( (i, el) ->
+    $('body').find('table').eq(3).find('tr').slice(2,opts.limit+2).each( (i, el) ->
       gobj =
       ( _.object(LIST_PARAMS,
         $(@).find('td').eq(k).text().trim() for k in WANTED_COLS))
@@ -108,8 +117,6 @@ module.exports = (opts, done) ->
         gobj.type = ''
       rfps.push gobj )
 
-    rfps = _.last(rfps, rfps.length-2)  # first two rows contain headers
-
     async.eachLimit rfps, ASYNC_RQ_MAX, getRfpDetails, (err) ->
       console.log(err.red) if err
     done rfps
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		name: 'Florida'
		index_url: 'http://www.myflorida.com/apps/vbs/vbs_www.search_r1.matching_ads_page'