# common_requests.py

import requests
from random_headers import get_headers


def getHTMLText(url, headers=None):
    """Fetch *url* with an HTTP GET and return the response object.

    Args:
        url: Address of the page to download.
        headers: Optional mapping of HTTP request headers passed straight
            through to ``requests.get``.

    Returns:
        The ``requests.Response`` for the page, with its ``encoding`` set to
        the encoding detected from the body, or ``None`` when the request
        fails (a failure message is printed in that case).
    """
    try:
        r = requests.get(url, headers=headers, timeout=30)
        r.raise_for_status()  # raises HTTPError for 4xx/5xx status codes
        # Use the encoding guessed from the content so that non-ASCII text
        # (e.g. Chinese pages) is not decoded into mojibake.
        r.encoding = r.apparent_encoding
        return r
    except requests.RequestException:
        # Only request-related failures (connection errors, timeouts, bad
        # URLs, HTTP error statuses) are handled here; KeyboardInterrupt and
        # genuine programming errors are allowed to propagate.
        print("出现异常,爬取失败!")
        return None

if __name__ == "__main__":
    # Interactive demo: ask for a URL, fetch it once, and show the status
    # code together with a slice of the page text.
    headers = get_headers()  # random headers as an anti-anti-scraping measure
    url = input("请输入需要爬取的网页URL:")
    print("-----------------------------------------------")
    # Fetch a single time and reuse the response; calling getHTMLText twice
    # would download the page twice and could yield inconsistent results.
    response = getHTMLText(url, headers)
    # getHTMLText returns None (after printing a message) when the fetch
    # fails, so guard before touching response attributes.
    if response is not None:
        print("状态码:", response.status_code)
        print(response.text[1000:1800])


# random_headers.py

import random


# Pool of browser User-Agent strings to choose from. Each entry is one
# complete User-Agent value; long values are split across adjacent string
# literals (implicit concatenation), so a comma must only appear at the end
# of a full entry. Built once at import time rather than on every call.
_USER_AGENTS = (
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
    'Chrome/45.0.2454.85 Safari/537.36 115Browser/6.0.3',
    'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 '
    'Safari/534.50',
    'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 '
    'Safari/534.50',
    'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)',
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)',
    'Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
    'Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 '
    'Safari/535.11',
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET '
    'CLR 2.0.50727; SE 2.X MetaSr 1.0)',
    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)',
)


def get_headers():
    """Return request headers carrying a randomly chosen User-Agent.

    Returns:
        A new dict with the single key ``'User-Agent'`` whose value is picked
        uniformly at random from the module's pool of browser User-Agent
        strings.
    """
    return {'User-Agent': random.choice(_USER_AGENTS)}