欢迎光临
我们一直在努力

Python爬取58同城租房数据 完美解决字体加密问题

前言

在这里我就不再一一介绍每个步骤的具体操作了,因为在爬取老版今日头条数据的那篇文章里都已经讲得非常清楚了。所以本文只讲解重点部分是怎么实现的;如果想看具体步骤,请先阅读我那篇今日头条的文章,里面非常详细地介绍了怎样找到加密的js代码和api接口。

私信小编01即可获取大量Python学习资料

 

58同城网站分析

58同城的数据爬取非常简单,唯一有点难的就是字体的加密,除此之外其他的数据用xpath即可获取。

想爬取不同地方的直接访问链接即可:

 


数据在链接中,直接请求获取即可。

 

 

 

字体加密破解

既然是字体加密,那么就先把字体文件找出来。寻找的方法很简单:在开发者工具的资源分类中选择Font,然后用这个字体的链接进行搜索查找即可。

 


已经找到这个字体了,它是在请求页面的时候随页面内容一起返回的,而且是base64编码的,只需要先做base64解码再保存成文件就可以了。

 

请求链接获取字体

import requests
from lxml import etree

def get_data(timeout=10):
    """Fetch the Beijing rental listing page from 58.com and return its HTML.

    Args:
        timeout: Seconds to wait for the server before giving up; passed
            straight to ``requests.get``.

    Returns:
        The response body as text, decoded by ``requests``.
    """
    url = "https://bj.58.com/chuzu/?PGTID=0d200001-0000-11e9-58e6-a658f219b27c&ClickID=1"
    headers = {
        # NOTE(review): the next four entries look like the browser's HTTP/2
        # pseudo-headers copied from devtools; here they are sent as ordinary
        # headers. Kept as-is to preserve the request - confirm they are needed.
        'authority': 'bj.58.com',
        'method': 'GET',
        'path': '/chuzu/?PGTID=0d200001-0000-11e9-58e6-a658f219b27c&ClickID=1',
        'scheme': 'https',
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        # 'br' is deliberately not advertised: requests only decodes Brotli
        # when an optional Brotli package is installed, and an undecoded body
        # would break the font and listing extraction that follows.
        'accept-encoding': 'gzip, deflate',
        'accept-language': 'zh-CN,zh;q=0.9',
        'cache-control': 'max-age=0',
        'cookie': 'f=n; commontopbar_new_city_info=1%7C%E5%8C%97%E4%BA%AC%7Cbj; commontopbar_ipcity=bj%7C%E5%8C%97%E4%BA%AC%7C0; userid360_xml=C0E9739B2022549506AFBC01231A1DAA; time_create=1606640420140; xxzl_cid=f4a781439d9247f393d0a1629bec00df; xzuid=e0e5ea78-ac5a-491b-819d-a869ab37a7a7; xxzl_deviceid=2G3xFS3qwOviMHxtC%2FVEituhpmiI%2FJ%2BAmJ08cPBulZSe7LcSgT98WgFcyNDbzMXJ; id58=c5/nfF+bz1xVS0tAA7tjAg==; 58tj_uuid=116f1ed0-7c25-477e-8887-be3602fa2389; new_uv=1; utm_source=; spm=; init_refer=https%253A%252F%252Fbj.58.com%252Fchuzu%252Fsub%252Fpn70%252F%253Fpagetype%253Dditie%2526PGTID%253D0d3090a7-0000-1b87-3e2e-c6efe8d19973%2526ClickID%253D2; wmda_uuid=13712f08f0e555f110b1b2684ce9d709; wmda_new_uuid=1; wmda_session_id_11187958619315=1604046685879-d3ad7e5f-77f6-29d7; wmda_visited_projects=%3B11187958619315; als=0; f=n; new_session=0',
        'sec-fetch-dest': 'document',
        'sec-fetch-mode': 'navigate',
        'sec-fetch-site': 'none',
        'sec-fetch-user': '?1',
        'upgrade-insecure-requests': '1',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.75 Safari/537.36'
    }
    # Without a timeout, requests waits indefinitely on a stalled connection.
    response = requests.get(url=url, headers=headers, timeout=timeout)
    return response.text

def get_font(data):
    """Extract the base64-embedded TTF font from the page and save it to disk.

    The font is located by scanning the raw markup for its ``data:`` URI, so
    the result does not depend on which ``<script>`` tag carries it. The
    decoded bytes are written to ``fangchan-secret.ttf`` in the working
    directory.

    Args:
        data: HTML source of the listing page.

    Raises:
        ValueError: If no embedded font is present in ``data``.
    """
    # Imported locally because this script's module-level imports do not
    # include these two standard-library modules.
    import base64
    import re

    # The parentheses of ``url(...)`` are literal text and must be escaped;
    # left unescaped they form a regex group and the pattern never matches.
    match = re.search(
        r"src:url\('data:application/font-ttf;charset=utf-8;base64,(.*?)'\)",
        data,
        re.S,
    )
    if match is None:
        raise ValueError('no base64-embedded font found in page source')
    with open('fangchan-secret.ttf', 'wb') as f:
        f.write(base64.b64decode(match.group(1)))

if __name__ == "__main__":
    # Download the listing page and dump the font embedded in it to disk.
    get_font(get_data())

 

解析字体字符

import base64
from fontTools.ttLib import TTFont
import re
import requests
from lxml import etree

def get_data(timeout=10):
    """Fetch the Beijing rental listing page from 58.com and return its HTML.

    Args:
        timeout: Seconds to wait for the server before giving up; passed
            straight to ``requests.get``.

    Returns:
        The response body as text, decoded by ``requests``.
    """
    url = "https://bj.58.com/chuzu/?PGTID=0d200001-0000-11e9-58e6-a658f219b27c&ClickID=1"
    headers = {
        # NOTE(review): the next four entries look like the browser's HTTP/2
        # pseudo-headers copied from devtools; here they are sent as ordinary
        # headers. Kept as-is to preserve the request - confirm they are needed.
        'authority': 'bj.58.com',
        'method': 'GET',
        'path': '/chuzu/?PGTID=0d200001-0000-11e9-58e6-a658f219b27c&ClickID=1',
        'scheme': 'https',
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        # 'br' is deliberately not advertised: requests only decodes Brotli
        # when an optional Brotli package is installed, and an undecoded body
        # would break the font and listing extraction that follows.
        'accept-encoding': 'gzip, deflate',
        'accept-language': 'zh-CN,zh;q=0.9',
        'cache-control': 'max-age=0',
        'cookie': 'f=n; commontopbar_new_city_info=1%7C%E5%8C%97%E4%BA%AC%7Cbj; commontopbar_ipcity=bj%7C%E5%8C%97%E4%BA%AC%7C0; userid360_xml=C0E9739B2022549506AFBC01231A1DAA; time_create=1606640420140; xxzl_cid=f4a781439d9247f393d0a1629bec00df; xzuid=e0e5ea78-ac5a-491b-819d-a869ab37a7a7; xxzl_deviceid=2G3xFS3qwOviMHxtC%2FVEituhpmiI%2FJ%2BAmJ08cPBulZSe7LcSgT98WgFcyNDbzMXJ; id58=c5/nfF+bz1xVS0tAA7tjAg==; 58tj_uuid=116f1ed0-7c25-477e-8887-be3602fa2389; new_uv=1; utm_source=; spm=; init_refer=https%253A%252F%252Fbj.58.com%252Fchuzu%252Fsub%252Fpn70%252F%253Fpagetype%253Dditie%2526PGTID%253D0d3090a7-0000-1b87-3e2e-c6efe8d19973%2526ClickID%253D2; wmda_uuid=13712f08f0e555f110b1b2684ce9d709; wmda_new_uuid=1; wmda_session_id_11187958619315=1604046685879-d3ad7e5f-77f6-29d7; wmda_visited_projects=%3B11187958619315; als=0; f=n; new_session=0',
        'sec-fetch-dest': 'document',
        'sec-fetch-mode': 'navigate',
        'sec-fetch-site': 'none',
        'sec-fetch-user': '?1',
        'upgrade-insecure-requests': '1',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.75 Safari/537.36'
    }
    # Without a timeout, requests waits indefinitely on a stalled connection.
    response = requests.get(url=url, headers=headers, timeout=timeout)
    return response.text

def get_font(data):
    """Extract the base64-embedded TTF font from the page and save it to disk.

    The font is located by scanning the raw markup for its ``data:`` URI, so
    the result does not depend on which ``<script>`` tag carries it. The
    decoded bytes are written to ``fangchan-secret.ttf`` in the working
    directory.

    Args:
        data: HTML source of the listing page.

    Raises:
        ValueError: If no embedded font is present in ``data``.
    """
    # The parentheses of ``url(...)`` are literal text and must be escaped;
    # left unescaped they form a regex group and the pattern never matches.
    match = re.search(
        r"src:url\('data:application/font-ttf;charset=utf-8;base64,(.*?)'\)",
        data,
        re.S,
    )
    if match is None:
        raise ValueError('no base64-embedded font found in page source')
    with open('fangchan-secret.ttf', 'wb') as f:
        f.write(base64.b64decode(match.group(1)))

def parse_font():
    """Build and print the code-point-to-digit mapping of the saved font.

    Reads ``fangchan-secret.ttf`` and, for every code point in the font's
    best cmap, takes the number embedded in the glyph name minus one as the
    value for that code point.

    Returns:
        dict mapping ``hex(code_point)`` strings to the derived int. The
        same mapping is printed before it is returned.
    """
    font = TTFont('fangchan-secret.ttf')
    bestcmap = font['cmap'].getBestCmap()
    newmap = dict()
    for code_point, glyph_name in bestcmap.items():
        # ``\d+`` is required here; a bare ``d+`` looks for the letter "d"
        # instead of the digits in the glyph name.
        # presumably glyph names look like 'glyph00008' - verify on a real font
        number = re.search(r'(\d+)', glyph_name)
        if number is None:
            # No number in the glyph name, so no value can be derived from it.
            continue
        newmap[hex(code_point)] = int(number.group(1)) - 1
    print(newmap)
    return newmap

if __name__ == "__main__":
    # Fetch the page, save its embedded font, then show the decoded mapping.
    page = get_data()
    get_font(page)
    parse_font()

 


我们发现字体编号和之前的不符合,比如之前是0x9476=7,而这里的是2,这是什么原因呢?是因为它的字体是动态生成的,每次请求返回的编码所对应的数字都可能不同。不过由于我们每次都是从当前返回的页面中重新提取字体并解析映射,所以不影响代码的正常运行与结果。

 

import base64
from fontTools.ttLib import TTFont
import re
import requests
from lxml import etree

def get_data(timeout=10):
    """Fetch the Beijing rental listing page from 58.com and return its HTML.

    Args:
        timeout: Seconds to wait for the server before giving up; passed
            straight to ``requests.get``.

    Returns:
        The response body as text, decoded by ``requests``.
    """
    url = "https://bj.58.com/chuzu/?PGTID=0d200001-0000-11e9-58e6-a658f219b27c&ClickID=1"
    headers = {
        # NOTE(review): the next four entries look like the browser's HTTP/2
        # pseudo-headers copied from devtools; here they are sent as ordinary
        # headers. Kept as-is to preserve the request - confirm they are needed.
        'authority': 'bj.58.com',
        'method': 'GET',
        'path': '/chuzu/?PGTID=0d200001-0000-11e9-58e6-a658f219b27c&ClickID=1',
        'scheme': 'https',
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        # 'br' is deliberately not advertised: requests only decodes Brotli
        # when an optional Brotli package is installed, and an undecoded body
        # would break the font and listing extraction that follows.
        'accept-encoding': 'gzip, deflate',
        'accept-language': 'zh-CN,zh;q=0.9',
        'cache-control': 'max-age=0',
        'cookie': 'f=n; commontopbar_new_city_info=1%7C%E5%8C%97%E4%BA%AC%7Cbj; commontopbar_ipcity=bj%7C%E5%8C%97%E4%BA%AC%7C0; userid360_xml=C0E9739B2022549506AFBC01231A1DAA; time_create=1606640420140; xxzl_cid=f4a781439d9247f393d0a1629bec00df; xzuid=e0e5ea78-ac5a-491b-819d-a869ab37a7a7; xxzl_deviceid=2G3xFS3qwOviMHxtC%2FVEituhpmiI%2FJ%2BAmJ08cPBulZSe7LcSgT98WgFcyNDbzMXJ; id58=c5/nfF+bz1xVS0tAA7tjAg==; 58tj_uuid=116f1ed0-7c25-477e-8887-be3602fa2389; new_uv=1; utm_source=; spm=; init_refer=https%253A%252F%252Fbj.58.com%252Fchuzu%252Fsub%252Fpn70%252F%253Fpagetype%253Dditie%2526PGTID%253D0d3090a7-0000-1b87-3e2e-c6efe8d19973%2526ClickID%253D2; wmda_uuid=13712f08f0e555f110b1b2684ce9d709; wmda_new_uuid=1; wmda_session_id_11187958619315=1604046685879-d3ad7e5f-77f6-29d7; wmda_visited_projects=%3B11187958619315; als=0; f=n; new_session=0',
        'sec-fetch-dest': 'document',
        'sec-fetch-mode': 'navigate',
        'sec-fetch-site': 'none',
        'sec-fetch-user': '?1',
        'upgrade-insecure-requests': '1',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.75 Safari/537.36'
    }
    # Without a timeout, requests waits indefinitely on a stalled connection.
    response = requests.get(url=url, headers=headers, timeout=timeout)
    return response.text

def get_font(data):
    """Extract the base64-embedded TTF font from the page and save it to disk.

    The font is located by scanning the raw markup for its ``data:`` URI, so
    the result does not depend on which ``<script>`` tag carries it. The
    decoded bytes are written to ``fangchan-secret.ttf`` in the working
    directory.

    Args:
        data: HTML source of the listing page.

    Raises:
        ValueError: If no embedded font is present in ``data``.
    """
    # The parentheses of ``url(...)`` are literal text and must be escaped;
    # left unescaped they form a regex group and the pattern never matches.
    match = re.search(
        r"src:url\('data:application/font-ttf;charset=utf-8;base64,(.*?)'\)",
        data,
        re.S,
    )
    if match is None:
        raise ValueError('no base64-embedded font found in page source')
    with open('fangchan-secret.ttf', 'wb') as f:
        f.write(base64.b64decode(match.group(1)))

def parse_font():
    """Build the code-point-to-digit mapping of the saved font.

    Reads ``fangchan-secret.ttf`` and, for every code point in the font's
    best cmap, takes the number embedded in the glyph name minus one as the
    value for that code point.

    Returns:
        dict mapping ``hex(code_point)`` strings to the derived int.
    """
    font = TTFont('fangchan-secret.ttf')
    bestcmap = font['cmap'].getBestCmap()
    newmap = dict()
    for code_point, glyph_name in bestcmap.items():
        # ``\d+`` is required here; a bare ``d+`` looks for the letter "d"
        # instead of the digits in the glyph name.
        # presumably glyph names look like 'glyph00008' - verify on a real font
        number = re.search(r'(\d+)', glyph_name)
        if number is None:
            # No number in the glyph name, so no value can be derived from it.
            continue
        newmap[hex(code_point)] = int(number.group(1)) - 1
    return newmap

def parse_data(data,newmap):
    """Decode the obfuscated digits in the page and print each listing.

    Every ``&#x....;`` entity whose code point appears in ``newmap`` is
    replaced by its digit before the HTML is parsed, then the room
    description and price of each listing are printed.

    Args:
        data: Raw HTML source of the listing page.
        newmap: Mapping of ``hex(code_point)`` strings (e.g. ``'0x9476'``)
            to the digit that code point stands for.
    """
    for key, value in newmap.items():
        # '0x9476' -> '&#x9476;', the entity form looked for in the raw HTML.
        # replace() is a no-op when the entity is absent, so no pre-check.
        data = data.replace(key.replace('0x', '&#x') + ';', str(value))
    html = etree.HTML(data)
    # The final <li> is dropped - presumably it is not a listing (verify).
    for house in html.xpath('//ul[@class="house-list"]/li')[:-1]:
        rooms = house.xpath('.//p[@class="room"]/text()')
        prices = house.xpath('.//b[@class="strongbox"]/text()')
        if not rooms or not prices:
            # Skip entries lacking either node instead of aborting the
            # whole run with an IndexError.
            continue
        print(rooms[0], prices[0])

if __name__ == "__main__":
    # Fetch the page, save its embedded font, derive the digit mapping from
    # that font, then decode and print the listings of the same page.
    page_source = get_data()
    get_font(page_source)
    digit_map = parse_font()
    parse_data(page_source, digit_map)

 

声明:本文仅供学习交流使用,请勿用于商业用途,违者后果自负。

 收藏 (0) 打赏

您可以选择一种方式赞助本站

支付宝扫一扫赞助

微信钱包扫描赞助

未经允许不得转载:英协网 » Python爬取58同城租房数据 完美解决字体加密问题

分享到: 生成海报
avatar

热门文章

  • 评论 抢沙发

    • QQ号
    • 昵称 (必填)
    • 邮箱 (必填)
    • 网址

    登录

    忘记密码 ?

    切换登录

    注册

    我们将发送一封验证邮件至你的邮箱, 请正确填写以完成账号注册和激活