58同城房产字体反爬

获取HTML

# Page to download and the headers sent with the request.
url = "https://sz.58.com/zufang/"

UA = {
    "referer": "https://www.google.com/",
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.183 Safari/537.36",
}

# A timeout keeps the script from hanging forever on a stalled connection.
html = requests.get(url=url, headers=UA, timeout=10)
# Stop on an HTTP error instead of saving an error page to disk.
html.raise_for_status()
# Force UTF-8 decoding of the response body.
html.encoding = "utf-8"

# Save the page so the later steps can work from the local copy.
ret = html.text
with open("58-fangchan.txt", "w", encoding="utf-8") as f:
    f.write(ret)

获取字体文件


# 1. Read the saved page and pull out the embedded font: the text that
#    follows "base64," up to the closing "')".
with open("58-fangchan.txt", "r", encoding="utf-8") as f:
    content = f.read()

font_match = re.search(r"base64,(.*?)'\)", content)
if font_match is None:
    # Without this check a page that carries no embedded font would fail
    # with an opaque AttributeError on None.
    raise ValueError("no base64-encoded font found in 58-fangchan.txt")
font_face = font_match.group(1)

转换 xml，分析关系

from fontTools.ttLib import TTFont

# Dump the font file as XML so its tables can be inspected by hand.
# NOTE(review): nothing shown here writes this .ttf file; presumably it was
# saved manually from the decoded base64 data -- confirm.
ttf_path = "./b-58fangchan字体.ttf"
xml_path = "b-58fangchan字体.xml"

font = TTFont(ttf_path)
font.saveXML(xml_path)

字体的映射关系

1	通过反复分析可以得出：取出 glyph 名称中的数字再减 1，就是页面上对应的真实数字

全部代码

import base64
import io
import re
from lxml import etree

from fontTools.ttLib import TTFont

import requests

# ret = base64.b64decode(font_face)
# with open('58fangchan.ttf', 'wb') as f:
# 	f.write(ret)

# font = TTFont(ret)
# font.saveXML('b-58.xml')



# Target page and request headers for the download step, which is
# currently commented out further down in this script.
url = "https://sz.58.com/zufang/"

UA = {
    "referer": "https://www.google.com/",
    "user-agent": (
        "Mozilla/5.0 (Windows NT 10.0; WOW64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/86.0.4240.183 Safari/537.36"
    ),
}

# html = requests.get(url=url, headers=UA)
# html.encoding="utf-8"
#
# ret = html.text
# with open('58-fangchan.txt', "w", encoding="utf-8") as f:
# 	f.write(ret)


# 1. Read the saved page and pull out the embedded font: the text that
#    follows "base64," up to the closing "')".
with open("58-fangchan.txt", "r", encoding="utf-8") as f:
    content = f.read()

font_match = re.search(r"base64,(.*?)'\)", content)
if font_match is None:
    # Without this check a page that carries no embedded font would fail
    # with an opaque AttributeError on None.
    raise ValueError("no base64-encoded font found in 58-fangchan.txt")
font_face = font_match.group(1)
# print(font_face)

# 2. Decode the base64 text and parse the font from memory; no font file
#    is written to disk.
ret = base64.b64decode(font_face)

font_buffer = io.BytesIO(ret)
font = TTFont(font_buffer)

# Mapping taken from the font's cmap table; the loop below treats keys as
# integer code points and values as glyph-name strings.
cmap_table = font["cmap"]
bestcmap = cmap_table.getBestCmap()

# Replace each obfuscated HTML entity in the page with the digit it stands for.
for codepoint, glyph_name in bestcmap.items():
    # The code point arrives as an int; build its entity text "&#x<hex>;"
    # with lowercase hex digits (same result as hex() with "0x" swapped out).
    entity = f"&#x{codepoint:x};"

    number = re.search(r"\d+", glyph_name)
    if number is None:
        # A glyph name without a number cannot be mapped to a digit; skip it
        # instead of crashing on None.
        continue

    # Found by inspecting this font: the number in the glyph name minus 1
    # is the digit displayed.
    digit = int(number.group(0)) - 1
    print(entity)
    print(digit)

    # str.replace leaves the text unchanged when the entity is absent, so no
    # separate membership test is needed.
    content = content.replace(entity, str(digit))
#
# print(content)
# 3. Parse the de-obfuscated page and print each listing title.
resp = etree.HTML(content)

lis = resp.xpath("//ul[@class='listUl']/li")
for li in lis:
    title_nodes = li.xpath('./div[@class="des"]/h2/a/text()')
    # Some <li> entries have no title link; skip those.
    if not title_nodes:
        continue
    title = title_nodes[0]
    print(title)

58同城 房产字体 反爬

58同城房产字体反爬