from lxml import etree
def _build_tag_tree(element, tree_pic=None):
    """Collect the tag structure below *element* as nested lists.

    A child without element children contributes its tag name as a plain
    string.  A child that has element children contributes a list whose
    first item is the child's own tag, followed by the entries of its
    children.  Nodes whose ``tag`` is not a string are skipped.

    Args:
        element: An iterable of child nodes, each exposing a ``tag``
            attribute.
        tree_pic: List to fill.  A fresh list is created when omitted, so
            separate calls never share state.

    Returns:
        The filled list.
    """
    if tree_pic is None:
        tree_pic = []
    for child in element:
        tag = child.tag
        if not isinstance(tag, str):
            continue
        # Seed the child's list with its own tag, then recurse into it.
        inner = _build_tag_tree(child, [tag])
        if len(inner) > 1:
            # The child has children of its own: keep it as a nested block.
            tree_pic.append(inner)
        else:
            # Leaf: only the tag name itself.
            tree_pic.extend(inner)
    return tree_pic


def _flush_run(tree_pic):
    """Rewrite the item before the counter as ``"<item>*<count>"``.

    *tree_pic* always ends with an integer counter holding how many times
    the item just before it has been seen in a row.  When that counter is
    greater than 1 the item is replaced in place by its label and the
    counter is reset to 1.

    The item is replaced by position.  Removing it by value would delete
    the first equal item in the list, which is the wrong one whenever the
    same tag also appears earlier.
    """
    count = tree_pic[-1]
    if count > 1:
        tree_pic[-2] = f'{tree_pic[-2]}*{count}'
        tree_pic[-1] = 1
    return tree_pic


def _record(tree_pic, obj):
    """Add *obj* to *tree_pic*, counting consecutive repetitions.

    If *obj* equals the most recent item, only the trailing counter is
    incremented.  Otherwise the current run is flushed and *obj* is
    inserted just before the counter.
    """
    if len(tree_pic) > 1 and tree_pic[-2] == obj:
        tree_pic[-1] += 1
    else:
        # The run was interrupted (or nothing is recorded yet, in which
        # case the counter is still 1 and the flush does nothing).
        _flush_run(tree_pic)
        tree_pic.insert(-1, obj)
    return tree_pic


def _cycle_label(tags, repeat):
    """Return the label used for a group of tags repeated *repeat* times."""
    return f"<{','.join(tags)}>*{repeat}"


def _group_cycles(tree_pic):
    """Collapse back-to-back repetitions of a group of two or more tags.

    Only string items take part in a group; any other item (a nested
    block) ends the current group and is copied through unchanged.
    ``['a', 'b', 'a', 'b', 'c']`` becomes ``['<a,b>*2', 'c']``.

    Returns:
        A new list, or *tree_pic* itself when nothing was produced.
    """
    grouped, pending = [], []
    # offset:    tags read since the last completed repetition
    # repeat:    repetitions of the current group found so far
    # cycle_len: number of tags in the current group
    offset, repeat, cycle_len = 0, 1, 0
    for item in tree_pic:
        if isinstance(item, str):
            pending.append(item)
            if repeat > 1:
                offset += 1
                # Has another full group length been read?
                if offset == cycle_len:
                    offset = 0
                    if pending[:cycle_len] == pending[cycle_len:]:
                        # The group repeated once more.
                        repeat += 1
                        pending = pending[:cycle_len]
                    else:
                        # The repetition was interrupted.
                        grouped.append(
                            _cycle_label(pending[:cycle_len], repeat))
                        pending = pending[cycle_len:]
                        repeat = 1
            elif len(pending) >= 4:
                # A group has at least 2 tags, so 4 are needed to compare
                # the last `size` tags with the `size` tags before them.
                for size in range(2, len(pending) // 2 + 1):
                    if pending[-size:] == pending[-2 * size:-size]:
                        repeat += 1
                        grouped += pending[:-2 * size]
                        pending = pending[-size:]
                        cycle_len = size
                        break
        else:
            # A nested block interrupts whatever was being collected.
            if repeat > 1:
                grouped.append(_cycle_label(pending[:cycle_len], repeat))
                grouped.extend(pending[cycle_len:])
                repeat = 1
            else:
                grouped.extend(pending)
            pending = []
            offset = 0
            grouped.append(item)
    # Emit whatever is still pending at the end of the input.
    if repeat > 1:
        grouped.append(_cycle_label(pending[:cycle_len], repeat))
        grouped.extend(pending[cycle_len:])
    else:
        grouped.extend(pending)
    return grouped or tree_pic


def _analyse_tree(tree):
    """Summarise a nested tag list produced by ``_build_tag_tree``.

    Items are scanned in order and nested lists are summarised first.
    Consecutive equal items are collapsed to ``"<item>*<count>"`` and
    repeated groups of tags are collapsed by ``_group_cycles``.

    NOTE(review): in a nested list the first item is the parent's own tag
    and it takes part in run and group detection together with the
    children -- confirm this is intended.
    """
    # The trailing integer counts how often the previous item repeated.
    tree_pic = [1]
    for item in tree:
        if isinstance(item, list):
            item = _analyse_tree(item)
        tree_pic = _record(tree_pic, item)
    _flush_run(tree_pic)
    # Drop the bookkeeping counter before grouping.
    tree_pic.pop()
    return _group_cycles(tree_pic)


def get_html_tree(html: str):
    """Return a compact summary of the tag structure of an HTML document.

    The document is parsed with ``etree.HTML``.  The tags below the parsed
    root are collected into nested lists and then summarised: consecutive
    equal items become ``"<item>*<count>"`` and repeated groups of tags
    become ``"<tag,tag>*<count>"``.

    Args:
        html: The HTML source to parse.

    Returns:
        A list of strings and nested lists describing the structure, or
        ``None`` when the parser returns ``None``.
    """
    root = etree.HTML(html)
    if root is None:
        return None
    return _analyse_tree(_build_tag_tree(root))
# Script entry point: a placeholder that does nothing when the file is run
# directly.
if __name__ == '__main__':
    pass
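# NOTE: the two lines below are comment-section text captured together with
# the page this file was copied from; they are not part of the program.
# 0 评论  (0 comments)
# 大哥整点话呗~  (Go on, say something~)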