58同城 房产字体 反爬
58同城 房产字体 反爬
获取 HTML
import requests

# Step 1: download the 58.com listing page and cache it on disk so the later
# steps can work from the local copy.
url = "https://sz.58.com/zufang/"
# Browser-like headers sent with the request.
UA = {
    "referer": "https://www.google.com/",
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.183 Safari/537.36",
}
# Without a timeout, requests waits indefinitely on a stalled connection.
html = requests.get(url=url, headers=UA, timeout=10)
# Fail loudly on an HTTP error instead of caching an error page.
html.raise_for_status()
html.encoding = "utf-8"
ret = html.text
with open('58-fangchan.txt', "w", encoding="utf-8") as f:
    f.write(ret)
获取字体文件
import re

# 1. Get the font: read the cached page and pull out the embedded font data.
with open("58-fangchan.txt", "r", encoding='utf-8') as f:
    content = f.read()
# The font is taken as the text between "base64," and the closing "')".
font_match = re.search(r"base64,(.*?)'\)", content)
if font_match is None:
    # Report the real problem rather than an AttributeError on None.
    raise ValueError("no base64-encoded font found in 58-fangchan.txt")
font_face = font_match.group(1)
转换为 XML,分析映射关系
from fontTools.ttLib import TTFont

# Load the font file and dump it as XML so the glyph mapping can be studied.
# NOTE(review): assumes the decoded font was already saved to this path;
# nothing shown here writes it. Confirm before running.
font = TTFont("./b-58fangchan字体.ttf")
font.saveXML("b-58fangchan字体.xml")
字体的映射关系
通过反复分析导出的 XML,可以得出结论:取出字形名称(glyph name)中的数字,再减 1,就是页面上对应的真实数字。
全部代码
# Decode the font-obfuscated digits on a 58.com listing page, then print the
# listing titles.
#
# The page embeds a base64-encoded font and writes digits as numeric character
# references (&#x....;). Each reference is replaced with the digit derived from
# the font's glyph name before the HTML is parsed.
import base64
import io
import re

import requests
from fontTools.ttLib import TTFont
from lxml import etree

# Listing page to scrape and the browser-like headers sent with the request.
url = "https://sz.58.com/zufang/"
UA = {
    "referer": "https://www.google.com/",
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.183 Safari/537.36",
}
# Local copy of the page, so repeated runs do not request the site again.
CACHE_FILE = "58-fangchan.txt"


def _download_page():
    """Download the listing page and store it in CACHE_FILE.

    Raises:
        requests.RequestException: on a network failure, timeout or HTTP error.
    """
    response = requests.get(url=url, headers=UA, timeout=10)
    # Do not cache an error page.
    response.raise_for_status()
    response.encoding = "utf-8"
    with open(CACHE_FILE, "w", encoding="utf-8") as f:
        f.write(response.text)


def _load_page():
    """Return the cached page text, downloading the page first if needed."""
    try:
        with open(CACHE_FILE, "r", encoding="utf-8") as f:
            return f.read()
    except FileNotFoundError:
        _download_page()
    with open(CACHE_FILE, "r", encoding="utf-8") as f:
        return f.read()


# 1. Get the font: it is the text between "base64," and the closing "')".
content = _load_page()
font_match = re.search(r"base64,(.*?)'\)", content)
if font_match is None:
    # Report the real problem rather than an AttributeError on None.
    raise ValueError("no base64-encoded font found in " + CACHE_FILE)
font_face = font_match.group(1)

# 2. Base64-decode the font and load it from memory.
# To inspect the font by hand, dump it with font.saveXML('b-58.xml').
ret = base64.b64decode(font_face)
font = TTFont(io.BytesIO(ret))
bestcmap = font['cmap'].getBestCmap()
for codepoint, glyph_name in bestcmap.items():
    number = re.search(r'(\d+)', glyph_name)
    if number is None:
        # A glyph name without a number has no digit to map to.
        continue
    # The cmap key is a decimal int; the page writes it as a hex reference.
    entity = hex(codepoint).replace('0x', '&#x') + ";"
    # Analysis of the font showed: number in the glyph name minus 1 = digit.
    digit = int(number.group(0)) - 1
    print(entity)
    print(digit)
    content = content.replace(entity, str(digit))

# 3. Parse the decoded page and print the title of each listing.
resp = etree.HTML(content)
lis = resp.xpath("//ul[@class='listUl']/li")
for li in lis:
    title = li.xpath('./div[@class="des"]/h2/a/text()')
    if title:
        print(title[0])