'''
robotsReader.py

Copyright 2006 Andres Riancho

This file is part of w3af, w3af.sourceforge.net .

w3af is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation version 2 of the License.

w3af is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with w3af; if not, write to the Free Software
Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA

'''

import core.controllers.outputManager as om
from core.controllers.basePlugin.baseDiscoveryPlugin import baseDiscoveryPlugin
import core.data.kb.knowledgeBase as kb
import core.data.parsers.urlParser as urlParser
from core.controllers.w3afException import w3afRunOnce

class robotsReader(baseDiscoveryPlugin):
	'''
	Discovery plugin that fetches the robots.txt file of the target site and requests
	every path listed in its Allow / Disallow directives.
	@author: Andres Riancho ( andres.riancho@gmail.com )
	'''

	def __init__(self):
		baseDiscoveryPlugin.__init__(self)
		# True until discover() has been executed once; see discover().
		self._exec = True

	def discover(self, fuzzableRequest ):
		'''
		Get the robots.txt file, parse it, and request every URL it mentions.
		
		The plugin only does its work on the first call; any later call raises w3afRunOnce.
		
		@parameter fuzzableRequest: A fuzzableRequest instance that contains (among other things) the URL to test.
		@return: A list with the fuzzable requests created from the robots.txt response and from the
		responses of every URL referenced by an Allow / Disallow directive.
		'''
		if not self._exec:
			# This will remove the plugin from the discovery plugins to be run.
			raise w3afRunOnce()
		
		# Only run once
		self._exec = False
		# The knowledge base entry is used below as a callable that receives a response.
		self.is404 = kb.kb.getData( 'error404page', '404' )
		self._fuzzableRequests = []
		
		baseUrl = urlParser.baseUrl( fuzzableRequest.getURL() )
		robotsUrl = urlParser.urlJoin(  baseUrl , 'robots.txt' )
		response = self._urlOpener.GET( robotsUrl, useCache=True )
		
		dirs = []
		if not self.is404( response ):
			dirs.append( robotsUrl )
			for path in self._parseRobotsPaths( response.getBody() ):
				url = urlParser.urlJoin(  baseUrl , path )
				# The same path is frequently repeated in several User-agent sections;
				# report and request it only once.
				if url not in dirs:
					dirs.append( url )
					om.out.information( 'robotsReader found a new URL: ' + url )
		
		for url in dirs:
			response = self._urlOpener.GET( url, useCache=True )
			fuzzReqs = self._createFuzzableRequests( response )
			self._fuzzableRequests.extend( fuzzReqs )
		
		return self._fuzzableRequests
	
	def _parseRobotsPaths( self, body ):
		'''
		Extract the values of the Allow and Disallow directives from a robots.txt body.
		
		Handles "\\n" and "\\r\\n" line endings, any amount of whitespace (including none) around
		the colon, leading whitespace, and full line or trailing "#" comments. Lines without a colon,
		other directives, and directives with an empty value are ignored.
		
		@parameter body: A string with the content of the robots.txt file.
		@return: A list of strings with the directive values, in the order in which they appear.
		'''
		paths = []
		for line in body.splitlines():
			# Everything after a "#" is a comment.
			line = line.split( '#', 1 )[0].strip()
			if ':' not in line:
				continue
			
			field, value = line.split( ':', 1 )
			if field.strip().upper() not in ( 'ALLOW', 'DISALLOW' ):
				continue
			
			value = value.strip()
			# An empty "Disallow:" does not reference any resource.
			if value:
				paths.append( value )
		
		return paths
		
	def getOptionsXML(self):
		'''
		This method returns a XML containing the Options that the plugin has.
		Using this XML the framework will build a window, a menu, or some other input method to retrieve
		the info from the user. The XML has to validate against the xml schema file located at :
		w3af/core/ui/userInterface.dtd
		
		This plugin has no options, so the option list is empty.
		
		@return: XML with the plugin options.
		'''	
		return	'<?xml version="1.0" encoding="ISO-8859-1"?>\
		<OptionList>\
		</OptionList>\
		'

	def setOptions( self, OptionList ):
		'''
		This method sets all the options that are configured using the user interface 
		generated by the framework using the result of getOptionsXML().
		
		This plugin has no options, so nothing is done.
		
		@parameter OptionList: A dictionary with the options for the plugin.
		@return: No value is returned.
		'''	
		pass

	def getPluginDeps( self ):
		'''
		@return: A list with the names of the plugins that should be run before the
		current one.
		'''
		# discover() reads the 'error404page' entry from the knowledge base.
		return ['discovery.error404page']

	def getLongDesc( self ):
		'''
		@return: A DETAILED description of the plugin functions and features.
		'''
		return '''
		This plugin searches for the robots.txt file, and parses it.
		
		This file is used to as an ACL that defines what URL's a search engine can access. By parsing this file, 
		you can get more information about the site.
		'''