python使用代理爬页面返回403: Forbidden

前几天给使用Python写的爬虫加上了可以使用代理访问爬取页面内容的功能，刚开始几天测试是正常的，可以正确返回所需内容，突然这两天一使用代理就全部返回“403: Forbidden”，怎么也没想明白是哪儿的问题，后来我都怀疑是不是我脚本哪里写错了……

def mrdede(url, headers={}, data=None, proxyip=None):
    # socket.setdefaulttimeout(6)
    info = ''
    if not proxyip:
        info += '非代理采集 '
        urllib2._opener = None
        request = urllib2.Request(url, headers=headers)
    else:
        info += '使用代理采集 '
        proxy_support = urllib2.ProxyHandler({'http': proxyip})
        opener = urllib2.build_opener(proxy_support)
        urllib2.install_opener(opener)
        request = urllib2.Request(url, headers=headers)
    if data is not None:
        data = urllib.urlencode(data)
        info += '[POST] data: %s' % data
    else:
        info += '[GET] '
    print info.decode('utf-8')  # #####
    try:
        resp = urllib2.urlopen(request, data, timeout=8)
        # resp = opener.open(url, data)
        print resp.read().decode('utf-8')
        # time.sleep(random.randint(60, 180))
    except urllib2.HTTPError as e:
        print 'HTTPError! The bad proxy is %s %s' % (proxyip, e)
    except urllib2.URLError as e:
        print 'URLError! The bad proxy is %s %s' % (proxyip, e)
    except BaseException as e:
        print 'Unknown Errors! The bad proxy is %s %s' % (proxyip, e)

def mrdede(url, headers={}, data=None, proxyip=None):

# socket.setdefaulttimeout(6)

info = ''

if not proxyip:

info += '非代理采集 '

urllib2._opener = None

request = urllib2.Request(url, headers=headers)

else:

info += '使用代理采集 '

proxy_support = urllib2.ProxyHandler({'http': proxyip})

opener = urllib2.build_opener(proxy_support)

urllib2.install_opener(opener)

request = urllib2.Request(url, headers=headers)

if data is not None:

data = urllib.urlencode(data)

info += '[POST] data: %s' % data

else:

info += '[GET] '

print info.decode('utf-8') # #####

try:

resp = urllib2.urlopen(request, data, timeout=8)

# resp = opener.open(url, data)

print resp.read().decode('utf-8')

# time.sleep(random.randint(60, 180))

except urllib2.HTTPError as e:

print 'HTTPError! The bad proxy is %s %s' % (proxyip, e)

except urllib2.URLError as e:

print 'URLError! The bad proxy is %s %s' % (proxyip, e)

except BaseException as e:

print 'Unknown Errors! The bad proxy is %s %s' % (proxyip, e)

上面是一段测试代码，只要使用代理就返回403。最后实在没办法了，就到群里请教了一下，结果有一位群友（3245935173）建议我试试使用“https”。测试之后果然返回了正确的数据，原来问题出在这里。

因为之前使用http协议是正确的，一定是代理网站后来修改了访问协议(http -> https)，真是万万没想到啊。

织梦先生

一个从零开始并且为了自己心底的理想不断奋斗中的人

python使用代理爬页面返回403: Forbidden