fxsjy / jparser Goto Github PK
View Code? Open in Web Editor NEW. A readability parser which can extract title, content, images from html pages
License: MIT License
A readability parser which can extract title, content, images from html pages
License: MIT License
需要加一个判断
在model.py文件中需要加入:
if not isinstance(t, str):
continue
`
import re
import lxml
import lxml.html
import urllib.parse
from .tags_util import clean_tags_only, clean_tags_hasprop, clean_tags_exactly, clean_tags
from .region import Region
class PageModel(object):
    """Readability-style extractor: pulls the title, main text content and
    images out of an HTML page string."""

    def __init__(self, page, url=""):
        """Clean *page* (an HTML string), parse it, and locate the content region.

        page -- raw HTML text (must be ``str``).
        url  -- base URL of the page; when non-empty it is used to
                absolutize relative image links.
        """
        assert isinstance(page, str)
        # Strip tags that never contribute readable content.
        for tag in ['style', 'script']:
            page = clean_tags(page, tag)
        page = clean_tags_hasprop(page, "div", "(display:.?none|comment|measure)")
        page = clean_tags_only(page, "(span|section|font|em)")
        self.doc = lxml.html.fromstring(page)
        self.url = url
        self.region = Region(self.doc)
        # Text shorter than this (whitespace removed) outside the winning
        # tag is treated as noise and dropped.
        self.impurity_threshold = 30
        # <p>/<li> elements whose anchor-text ratio exceeds this are dropped.
        self.anchor_ratio_limit = 0.3
        self.stripper = re.compile(r'\s+')

    def extract_content(self, region):
        """Walk *region* and return a list of ``{"type": ..., "data": ...}``
        dicts describing text nodes, images and tables, in document order."""
        items = region.xpath('.//text()|.//img|./table')
        # Histogram of text length per parent tag; the tag that wraps the
        # most text "wins" and is assumed to hold the main content.
        tag_hist = {}
        for item in items:
            if hasattr(item, 'tag'):
                # An element (img/table), not a text node -- no histogram entry.
                continue
            t = item.getparent().tag
            # Comment/processing-instruction nodes report a callable ``tag``
            # rather than a str; skip them.
            if not isinstance(t, str):
                continue
            tag_hist[t] = tag_hist.get(t, 0) + len(item.strip())
        winner_tag = None
        if tag_hist:
            # Highest text volume wins; ties break on tag name, matching the
            # original max-over-(count, tag) tuple ordering.
            winner_tag = max(tag_hist.items(), key=lambda kv: (kv[1], kv[0]))[0]
        contents = []
        for item in items:
            if not hasattr(item, 'tag'):
                txt = item.strip()
                parent_tag = item.getparent().tag
                # Short stray text outside the winning tag (and not a list
                # item) is treated as noise.
                if parent_tag != winner_tag \
                        and len(self.stripper.sub("", txt)) < self.impurity_threshold \
                        and parent_tag != 'li':
                    continue
                if txt == "":
                    continue
                contents.append({"type": "text", "data": txt})
            elif item.tag == 'table':
                if winner_tag == 'td':
                    # Cell text is already collected by the text branch above.
                    continue
                if item != region:
                    # Keep the table markup but unwrap links inside it.
                    for el in item.xpath(".//a"):
                        el.drop_tag()
                    table_s = lxml.html.tostring(item)  # NOTE: bytes on Python 3
                    contents.append({"type": "html", "data": table_s})
                else:
                    for sub_item in item.xpath("//td/text()"):
                        contents.append({"type": "text", "data": sub_item})
            elif item.tag == 'img':
                # Lazy-load attributes take precedence over the plain src.
                src = None
                for img_prop in ('original', 'file', 'data-original', 'src-info', 'data-src', 'src'):
                    src = item.get(img_prop)
                    if src is not None:
                        break
                if src is None:
                    # BUGFIX: an <img> with none of the source attributes left
                    # src unbound-to-None and crashed on src.startswith below.
                    continue
                if self.url != "":
                    if not src.startswith("/") and not src.startswith("http") and not src.startswith("./"):
                        src = "/" + src
                    src = urllib.parse.urljoin(self.url, src, False)
                contents.append({"type": "image", "data": {"src": src}})
        return contents

    def extract_title(self):
        """Guess the article title.

        Prefer a heading (h1/h2/h3 or <p class="title">) whose text matches
        the first '_'/'-'-separated segment of the document <title>;
        otherwise fall back to the longest candidate.
        """
        doc = self.doc
        tag_title = doc.xpath("/html/head/title/text()")
        # Site titles are often "Article - SiteName"; keep only the first part.
        s_tag_title = "".join(re.split(r'_|-', "".join(tag_title))[:1])
        title_candidates = doc.xpath('//h1/text()|//h2/text()|//h3/text()|//p[@class="title"]/text()')
        for c_title in title_candidates:
            c_title = c_title.strip()
            if c_title != "" and (s_tag_title.startswith(c_title) or s_tag_title.endswith(c_title)):
                return c_title
        # Fallback: longest candidate, ties broken lexicographically
        # (equivalent to sorting (-len, text) tuples and taking the first).
        return min([s_tag_title] + title_candidates,
                   key=lambda x: (-len(x.strip()), x))

    def extract(self):
        """Extract everything; returns ``{"title": str, "content": list}``."""
        title = self.extract_title()
        region = self.region.locate()
        if region is None:
            return {'title': '', 'content': []}
        # Drop link-heavy paragraphs/list items (navigation, "related" boxes).
        for p_el in region.xpath(".//p|.//li"):
            child_links = p_el.xpath(".//a/text()")
            count_p = len(" ".join(p_el.xpath(".//text()")))
            count_a = len(" ".join(child_links))
            if float(count_a) / (count_p + 1.0) > self.anchor_ratio_limit:
                p_el.drop_tree()
        # Unwrap inline link/emphasis tags so their text joins the flow.
        # BUGFIX: was ".//strong|//b" -- the '//b' half searched the whole
        # document instead of the located region.
        rm_tag_set = set()
        for el in region.xpath(".//a"):
            rm_tag_set.add(el)
        for el in region.xpath(".//strong|.//b"):
            rm_tag_set.add(el)
        for el in rm_tag_set:
            el.drop_tag()
        content = self.extract_content(region)
        return {"title": title, "content": content}
`
model.py 修改为
`
#!/bin/env python
#encoding=utf-8
import re
import lxml
import lxml.html
import urllib
from .tags_util import clean_tags_only, clean_tags_hasprop, clean_tags_exactly, clean_tags
from .region import Region
class PageModel(object):
    """Readability-style extractor: pulls the title, main text content and
    images out of an HTML page string."""

    def __init__(self, page, url=""):
        """Clean *page* (an HTML string), parse it, and locate the content region.

        page -- raw HTML text (must be ``str``).
        url  -- base URL of the page; when non-empty it is used to
                absolutize relative image links.
        """
        # isinstance also accepts str subclasses, unlike ``type(page) is str``.
        assert isinstance(page, str)
        # Strip tags that never contribute readable content.
        for tag in ['style', 'script']:
            page = clean_tags(page, tag)
        page = clean_tags_hasprop(page, "div", "(display:.?none|comment|measure)")
        page = clean_tags_only(page, "(span|section|font|em)")
        self.doc = lxml.html.fromstring(page)
        self.url = url
        self.region = Region(self.doc)
        # Text shorter than this (whitespace removed) outside the winning
        # tag is treated as noise and dropped.
        self.impurity_threshold = 30
        # <p>/<li> elements whose anchor-text ratio exceeds this are dropped.
        self.anchor_ratio_limit = 0.3
        self.stripper = re.compile(r'\s+')

    def extract_content(self, region):
        """Walk *region* and return a list of ``{"type": ..., "data": ...}``
        dicts describing text nodes, images and tables, in document order."""
        # Local import: the file level only does ``import urllib``, which
        # does not bind the ``urllib.parse`` submodule.
        import urllib.parse
        items = region.xpath('.//text()|.//img|./table')
        # Histogram of text length per parent tag; the tag that wraps the
        # most text "wins" and is assumed to hold the main content.
        tag_hist = {}
        for item in items:
            if hasattr(item, 'tag'):
                # An element (img/table), not a text node -- no histogram entry.
                continue
            t = item.getparent().tag
            # BUGFIX: comment/processing-instruction nodes report a callable
            # ``tag`` rather than a str; skip them so the aggregation below
            # does not key on a non-string tag.
            if not isinstance(t, str):
                continue
            tag_hist[t] = tag_hist.get(t, 0) + len(item.strip())
        winner_tag = None
        if tag_hist:
            # Highest text volume wins; ties break on tag name, matching the
            # original max-over-(count, tag) tuple ordering.
            winner_tag = max(tag_hist.items(), key=lambda kv: (kv[1], kv[0]))[0]
        contents = []
        for item in items:
            if not hasattr(item, 'tag'):
                txt = item.strip()
                parent_tag = item.getparent().tag
                # Short stray text outside the winning tag (and not a list
                # item) is treated as noise.
                if parent_tag != winner_tag \
                        and len(self.stripper.sub("", txt)) < self.impurity_threshold \
                        and parent_tag != 'li':
                    continue
                if txt == "":
                    # Skip whitespace-only text nodes.
                    continue
                contents.append({"type": "text", "data": txt})
            elif item.tag == 'table':
                if winner_tag == 'td':
                    # Cell text is already collected by the text branch above.
                    continue
                if item != region:
                    # Keep the table markup but unwrap links inside it.
                    for el in item.xpath(".//a"):
                        el.drop_tag()
                    table_s = lxml.html.tostring(item)  # NOTE: bytes on Python 3
                    contents.append({"type": "html", "data": table_s})
                else:
                    for sub_item in item.xpath("//td/text()"):
                        contents.append({"type": "text", "data": sub_item})
            elif item.tag == 'img':
                # Lazy-load attributes take precedence over the plain src.
                src = None
                for img_prop in ('original', 'file', 'data-original', 'src-info', 'data-src', 'src'):
                    src = item.get(img_prop)
                    if src is not None:
                        break
                if src is None:
                    # BUGFIX: an <img> with none of the source attributes left
                    # src as None and crashed on src.startswith below.
                    continue
                if self.url != "":
                    if not src.startswith("/") and not src.startswith("http") and not src.startswith("./"):
                        src = "/" + src
                    # BUGFIX: was ``urlparse.urljoin`` -- a NameError, since
                    # no ``urlparse`` name is ever imported.
                    src = urllib.parse.urljoin(self.url, src, False)
                contents.append({"type": "image", "data": {"src": src}})
        return contents

    def extract_title(self):
        """Guess the article title.

        Prefer a heading (h1/h2/h3 or <p class="title">) whose text matches
        the first '_'/'-'-separated segment of the document <title>;
        otherwise fall back to the longest candidate.
        """
        doc = self.doc
        tag_title = doc.xpath("/html/head/title/text()")
        # Site titles are often "Article - SiteName"; keep only the first part.
        s_tag_title = "".join(re.split(r'_|-', "".join(tag_title))[:1])
        title_candidates = doc.xpath('//h1/text()|//h2/text()|//h3/text()|//p[@class="title"]/text()')
        for c_title in title_candidates:
            c_title = c_title.strip()
            if c_title != "" and (s_tag_title.startswith(c_title) or s_tag_title.endswith(c_title)):
                return c_title
        # Fallback: longest candidate, ties broken lexicographically
        # (equivalent to sorting (-len, text) tuples and taking the first).
        return min([s_tag_title] + title_candidates,
                   key=lambda x: (-len(x.strip()), x))

    def extract(self):
        """Extract everything; returns ``{"title": str, "content": list}``."""
        title = self.extract_title()
        region = self.region.locate()
        if region is None:
            return {'title': '', 'content': []}
        # Drop link-heavy paragraphs/list items (navigation, "related" boxes).
        for p_el in region.xpath(".//p|.//li"):
            child_links = p_el.xpath(".//a/text()")
            count_p = len(" ".join(p_el.xpath(".//text()")))
            count_a = len(" ".join(child_links))
            if float(count_a) / (count_p + 1.0) > self.anchor_ratio_limit:
                p_el.drop_tree()
        # Unwrap inline link/emphasis tags so their text joins the flow.
        # BUGFIX: was ".//strong|//b" -- the '//b' half searched the whole
        # document instead of the located region.
        rm_tag_set = set()
        for el in region.xpath(".//a"):
            rm_tag_set.add(el)
        for el in region.xpath(".//strong|.//b"):
            rm_tag_set.add(el)
        for el in rm_tag_set:
            el.drop_tag()
        content = self.extract_content(region)
        return {"title": title, "content": content}
`
你好,我最近也在做相关工作,一般的网页正文都是有很多多余的噪声数据,需要去除,这块有考虑后面加吗
A declarative, efficient, and flexible JavaScript library for building user interfaces.
🖖 Vue.js is a progressive, incrementally-adoptable JavaScript framework for building UI on the web.
TypeScript is a superset of JavaScript that compiles to clean JavaScript output.
An Open Source Machine Learning Framework for Everyone
The Web framework for perfectionists with deadlines.
A PHP framework for web artisans
Bring data to life with SVG, Canvas and HTML. 📊📈🎉
JavaScript (JS) is a lightweight interpreted programming language with first-class functions.
Some thing interesting about web. New door for the world.
A server is a program made to process requests and deliver data to clients.
Machine learning is a way of modeling and interpreting data that allows a piece of software to respond intelligently.
Some thing interesting about visualization, use data art
Some thing interesting about game, make everyone happy.
We are working to build community through open source technology. NB: members must have two-factor auth.
Open source projects and samples from Microsoft.
Google ❤️ Open Source for everyone.
Alibaba Open Source for everyone
Data-Driven Documents codes.
China tencent open source team.