Jump to content
Romania-

HTTP Proxy Scraper (Python 2.7)

Recommended Posts

#Python 2.7.7 HTTP Proxy Scraper
import urllib2
from urllib2 import urlopen
from BeautifulSoup import BeautifulSoup
from random import randint
# Separator line printed between sections of console output.
dots = '..............................................................................?..'

# Commands accepted at the prompt. The main loop looks them up by position,
# so the order of this list must not change.
cL = ['view user agents', 'help', 'credits', 'command list', 'start']

# Pool of User-Agent strings; one is picked at random for each request.
userAgents = [
    'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:31.0) Gecko/20100101 Firefox/31.0',
    'Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; Acoo Browser 1.98.744; .NET CLR 3.5.30729)',
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; GTB5; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)',
    'Mozilla/4.0 (compatible; MSIE 7.0; America Online Browser 1.1; Windows NT 5.1; (R1 1.5); .NET CLR 2.0.50727; InfoPath.1)',
    'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.1) Gecko/20061026 BonEcho/2.0',
    'Mozilla/5.0 (X11; U; Linux i686 (x86_64); en-US; rv:1.8.1.12pre) Gecko/20080103 BonEcho/2.0.0.12pre',
    'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.1.9) Gecko/20071113 BonEcho/2.0.0.9',
    'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)',
    'Mozilla/4.0 (compatible; MSIE 4.01; AOL 4.0; Windows 98)',
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Acoo Browser; InfoPath.2; .NET CLR 2.0.50727; Alexa Toolbar)',
    'Opera/12.80 (Windows NT 5.1; U; en) Presto/2.10.289 Version/12.02',
    'Opera/9.80 (Windows NT 6.0) Presto/2.12.388 Version/12.14',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8) AppleWebKit/537.13+ (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2',
    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.0; Trident/5.0; TheWorld)',
    'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; SV1; TheWorld)',
]

mainUrl = 'http://free-proxy-list.net/'  # Target web page.

# Loop flags: mainLoop keeps the command prompt alive, searchLoop is raised
# while a scrape is in progress.
mainLoop = True
searchLoop = False

# Counters for walking the proxy table cells (a proxy is expected every
# 8 cells, with its port in the following cell).
myInt1 = 0
myInt2 = 0

# Start-up banner. Single-argument print(...) calls behave the same as the
# print statement on Python 2.7 and stay valid syntax on Python 3.
print('Python 2.7.7')
print(' Welcome to Python Proxy Scraper')
print(' Type "Help" for Help')
# Derive the advertised count from the list so the banner cannot drift from
# the actual number of User-Agent strings.
print(' %d User-Agent Edition' % len(userAgents))
print(dots)
# Number of cells per table row. The original script assumed a proxy address
# every 8 cells with its port in the cell right after it; that assumption is
# kept here and should be re-checked if the target page layout changes.
CELLS_PER_ROW = 8

# How many times a failed download is attempted before giving up. The
# original retried forever and silently; a bound keeps the prompt usable.
MAX_FETCH_ATTEMPTS = 3


def format_proxies(cells, row_width=CELLS_PER_ROW):
    """Pair address and port cells into ``host:port`` strings.

    cells     -- flat list of cell texts in document order.
    row_width -- number of cells that make up one table row; the first cell
                 of each row is taken as the address and the second as the
                 port.

    Returns a list of ``host:port`` strings, one per complete row, starting
    with the very first row. A trailing row with no port cell, or a row
    whose address or port is blank, is skipped rather than raising.
    """
    proxies = []
    # Stop one short of the end so ``start + 1`` is always a valid index.
    for start in range(0, len(cells) - 1, row_width):
        host = cells[start].strip()
        port = cells[start + 1].strip()
        if host and port:
            proxies.append(host + ':' + port)
    return proxies


def _cell_text(cell):
    """Return the text inside a BeautifulSoup cell tag, without any markup."""
    return ''.join(cell.findAll(text=True))


def _fetch_page(url, agent):
    """Download ``url`` sending ``agent`` as the User-Agent; return the body."""
    request = urllib2.Request(url)
    request.add_unredirected_header('User-Agent', agent)
    page = urlopen(request)
    try:
        return page.read()
    finally:
        # Always release the connection, even if read() fails.
        page.close()


def _download_with_retries():
    """Fetch mainUrl, trying a fresh random User-Agent on each attempt.

    Returns the page body, or None once MAX_FETCH_ATTEMPTS attempts failed.
    """
    for attempt in range(1, MAX_FETCH_ATTEMPTS + 1):
        agent = userAgents[randint(0, len(userAgents) - 1)]
        print('<>< Starting Proxy Scrape At ' + mainUrl)
        print('<>< Agent: ' + agent)
        try:
            return _fetch_page(mainUrl, agent)
        except IOError as err:
            # urllib2.URLError/HTTPError and socket errors all derive from
            # IOError on Python 2.7; report them instead of hiding them.
            print('<>< Attempt %d of %d failed: %s'
                  % (attempt, MAX_FETCH_ATTEMPTS, err))
    return None


def _ask_yes_no():
    """Prompt until the user answers y or n; return True for y."""
    while True:
        answer = raw_input('<>< ').strip().lower()
        if answer in ('y', 'n'):
            return answer == 'y'
        print('<>< Please answer y or n')


def _scrape():
    """Handle the 'start' command: download, confirm, then list proxies."""
    html = _download_with_retries()
    if html is None:
        print('<>< Giving up, could not download ' + mainUrl)
        return
    print('<>< Raw data gathered, would you like to sort and view? y/n')
    if not _ask_yes_no():
        return
    soup = BeautifulSoup(html)
    table_body = soup.find('tbody')  # Keyword that isolates the proxy table.
    if table_body is None:
        print('<>< No proxy table found at ' + mainUrl)
        return
    cells = [_cell_text(cell) for cell in table_body.findAll('td')]
    for proxy in format_proxies(cells):
        print(proxy)


def _show_user_agents():
    """Handle 'view user agents': list every User-Agent with its index."""
    for index, agent in enumerate(userAgents):
        print(dots)
        print('%d %s' % (index, agent))
    print(dots)


def _show_help():
    """Handle 'help': print version and usage notes."""
    print('<>< Version 1.0')
    print('<>< Compatible with Python 2.7.7, maybe unstable on other versions.')
    print('<>< Type command list for a list of commands..')
    print('<>< Feel free to edit/tweak my script give credit where needed if repost.')
    print(dots)
    print('<>< In order to get this to work you might need to tweak some veriables in the code sorry skids...')
    print('<>< Default Target http://free-proxy-list.net.')
    print('<>< This script will scrape proxies from mainUrl.')
    print('<>< The script will then use BeautifulSoup to isolate the table containing the proxies and ports.')
    print('<>< The script will then remove any unwanted characters and spaces and present you a nicely formated list of proxies')
    print(dots)


def _show_credits():
    """Handle 'credits'."""
    print('<>< Kopuz 2014')
    print('<>< Feel free to edit/tweak my script give credit where needed if repost.')
    print(dots)


def _show_commands():
    """Handle 'command list': print the raw list of command names."""
    print(dots)
    print(cL)
    print(dots)


def main():
    """Run the interactive prompt until interrupted.

    Commands are matched against cL by position; input is stripped and
    lower-cased first so that e.g. "Help" (as the banner suggests) works.
    """
    handlers = dict(zip(cL, (
        _show_user_agents,
        _show_help,
        _show_credits,
        _show_commands,
        _scrape,
    )))
    while mainLoop:
        command = raw_input('<>< ').strip().lower()
        handler = handlers.get(command)
        if handler is None:
            print('<>< Unknown Command')
        else:
            handler()


if __name__ == '__main__':
    main()

Link to comment
Share on other sites

Join the conversation

You can post now and register later. If you have an account, sign in now to post with your account.

Guest
Reply to this topic...

×   Pasted as rich text.   Paste as plain text instead

  Only 75 emoji are allowed.

×   Your link has been automatically embedded.   Display as a link instead

×   Your previous content has been restored.   Clear editor

×   You cannot paste images directly. Upload or insert images from URL.



×
×
  • Create New...