0%

58同城 房产字体 反爬

58同城 房产字体 反爬

58同城 房产字体 反爬

  1. 获取HTML

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    # Step 1: download the rental listing page and cache it on disk so the
    # later steps can be re-run without hitting the site again.
    url = "https://sz.58.com/zufang/"

    # Browser-like headers (referer + desktop Chrome user-agent) sent with the request.
    UA = {
        "referer": "https://www.google.com/",
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.183 Safari/537.36"
    }

    # A timeout keeps the script from hanging forever on a stalled connection.
    html = requests.get(url=url, headers=UA, timeout=10)
    # Stop here on a 4xx/5xx response instead of caching an HTTP error page
    # as if it were the listing.
    html.raise_for_status()
    html.encoding = "utf-8"

    ret = html.text
    with open('58-fangchan.txt', "w", encoding="utf-8") as f:
        f.write(ret)
  1. 获取字体文件

    1
    2
    3
    4
    5
    6

    # 1. Get the font file: read the cached page and pull out the text that
    #    sits between "base64," and "')" (presumably the @font-face data URI
    #    carrying the obfuscation font -- confirm against the saved page).
    with open("58-fangchan.txt", "r", encoding='utf-8') as f:
        content = f.read()

    font_match = re.search(r"base64,(.*?)'\)", content)
    if font_match is None:
        # Without this guard a page lacking the embedded font fails with an
        # opaque AttributeError on `.group`.
        raise ValueError("no base64-embedded font found in 58-fangchan.txt")
    font_face = font_match.group(1)
  1. 转换 xml, 分析关系

    1
    2
    3
    4
    5
    6
    from fontTools.ttLib import TTFont

    # Load the locally saved TTF and dump it as XML so its tables can be
    # inspected by hand.
    # NOTE(review): assumes the .ttf was written beforehand (e.g. from the
    # base64-decoded font) -- that step is not shown in this snippet.
    ttf_path = "./b-58fangchan字体.ttf"
    xml_path = "b-58fangchan字体.xml"

    font = TTFont(ttf_path)
    font.saveXML(xml_path)


  1. 字体的映射关系

    1
    通过反复分析导出的 XML 可以得出映射规律:取出字形名称中的数字,再减 1,就是页面上对应的真实数字。
  2. 全部代码

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    21
    22
    23
    24
    25
    26
    27
    28
    29
    30
    31
    32
    33
    34
    35
    36
    37
    38
    39
    40
    41
    42
    43
    44
    45
    46
    47
    48
    49
    50
    51
    52
    53
    54
    55
    56
    57
    58
    59
    60
    61
    62
    63
    64
    65
    66
    67
    68
    69
    70
    import base64
    import io
    import re
    from lxml import etree

    from fontTools.ttLib import TTFont

    import requests

    # ret = base64.b64decode(font_face)
    # with open('58fangchan.ttf', 'wb') as f:
    # f.write(ret)

    # font = TTFont(ret)
    # font.saveXML('b-58.xml')



    # 58.com rental ("zufang") listing page that this script targets.
    url = "https://sz.58.com/zufang/"

    # Browser-like request headers (referer + desktop Chrome user-agent) for
    # the download step. That request is commented out below, so this run
    # works from the previously saved 58-fangchan.txt instead.
    UA = {
    "referer": "https://www.google.com/",
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.183 Safari/537.36"
    }

    # html = requests.get(url=url, headers=UA)
    # html.encoding="utf-8"
    #
    # ret = html.text
    # with open('58-fangchan.txt', "w", encoding="utf-8") as f:
    # f.write(ret)


    # 1. Get the font file: read the cached page and pull out the text that
    #    sits between "base64," and "')" (presumably the @font-face data URI
    #    carrying the obfuscation font -- confirm against the saved page).
    with open("58-fangchan.txt", "r", encoding='utf-8') as f:
        content = f.read()

    font_match = re.search(r"base64,(.*?)'\)", content)
    if font_match is None:
        # Without this guard a page lacking the embedded font fails with an
        # opaque AttributeError on `.group`.
        raise ValueError("no base64-embedded font found in 58-fangchan.txt")
    font_face = font_match.group(1)

    # 2. Base64-decode the payload into raw font bytes.
    ret = base64.b64decode(font_face)

    # Parse the font straight from memory; no temporary .ttf file is needed.
    font = TTFont(io.BytesIO(ret))

    # Mapping of code point (int) -> glyph name taken from the font's cmap table.
    bestcmap = font['cmap'].getBestCmap()

    # Replace every obfuscated HTML entity in the page with the digit it renders as.
    for codepoint, glyph_name in bestcmap.items():
        # The cmap key is a decimal int; the page references it as a
        # lowercase hexadecimal character entity, e.g. 0x9fa4 -> "&#x9fa4;".
        entity = f"&#x{codepoint:x};"

        # Rule derived from inspecting the font: the number embedded in the
        # glyph name, minus 1, is the real digit.
        number_match = re.search(r'(\d+)', glyph_name)
        if number_match is None:
            # A glyph name with no digits cannot be mapped by this rule;
            # skip it rather than crash on `.group`.
            continue
        digit = int(number_match.group(0)) - 1

        print(entity)
        print(digit)
        # str.replace is a no-op when the entity is absent, so no membership
        # test is needed first.
        content = content.replace(entity, str(digit))
    # 3. Get the titles: parse the de-obfuscated HTML and print the title
    #    text of each listing entry.
    resp = etree.HTML(content)

    lis = resp.xpath("//ul[@class='listUl']/li")
    for li in lis:
        title = li.xpath('./div[@class="des"]/h2/a/text()')
        # Some <li> elements have no title link; the XPath then returns an
        # empty list and the entry is skipped.
        if title:
            title = title[0]
            print(title)
------ 本文结束------

欢迎关注我的其它发布渠道