fxsjy / jparser Goto Github PK
View Code? Open in Web Editor NEW. A readability parser which can extract title, content, images from html pages
License: MIT License
A readability parser which can extract title, content, images from html pages
License: MIT License
需要加一个判断
在model.py文件中需要加入:
if not isinstance(t, str):
continue
`
import re
import lxml
import lxml.html
import urllib.parse
from .tags_util import clean_tags_only, clean_tags_hasprop, clean_tags_exactly, clean_tags
from .region import Region
class PageModel(object):
    """Readability-style extractor: pulls the title, main text content and
    images out of an HTML page string."""

    def __init__(self, page, url=""):
        """Clean *page* (an HTML string), parse it, and locate the content region.

        page -- raw HTML text (must be ``str``).
        url  -- base URL of the page; when non-empty it is used to
                absolutize relative image links.
        """
        assert isinstance(page, str)
        # Strip tags that never contribute readable content.
        for tag in ['style', 'script']:
            page = clean_tags(page, tag)
        page = clean_tags_hasprop(page, "div", "(display:.?none|comment|measure)")
        page = clean_tags_only(page, "(span|section|font|em)")
        self.doc = lxml.html.fromstring(page)
        self.url = url
        self.region = Region(self.doc)
        # Text shorter than this (whitespace removed) outside the winning
        # tag is treated as noise and dropped.
        self.impurity_threshold = 30
        # <p>/<li> elements whose anchor-text ratio exceeds this are dropped.
        self.anchor_ratio_limit = 0.3
        self.stripper = re.compile(r'\s+')

    def extract_content(self, region):
        """Walk *region* and return a list of ``{"type": ..., "data": ...}``
        dicts describing text nodes, images and tables, in document order."""
        items = region.xpath('.//text()|.//img|./table')
        # Histogram of text length per parent tag; the tag that wraps the
        # most text "wins" and is assumed to hold the main content.
        tag_hist = {}
        for item in items:
            if hasattr(item, 'tag'):
                # An element (img/table), not a text node -- no histogram entry.
                continue
            t = item.getparent().tag
            # Comment/processing-instruction nodes report a callable ``tag``
            # rather than a str; skip them.
            if not isinstance(t, str):
                continue
            tag_hist[t] = tag_hist.get(t, 0) + len(item.strip())
        winner_tag = None
        if tag_hist:
            # Highest text volume wins; ties break on tag name, matching the
            # original max-over-(count, tag) tuple ordering.
            winner_tag = max(tag_hist.items(), key=lambda kv: (kv[1], kv[0]))[0]
        contents = []
        for item in items:
            if not hasattr(item, 'tag'):
                txt = item.strip()
                parent_tag = item.getparent().tag
                # Short stray text outside the winning tag (and not a list
                # item) is treated as noise.
                if parent_tag != winner_tag \
                        and len(self.stripper.sub("", txt)) < self.impurity_threshold \
                        and parent_tag != 'li':
                    continue
                if txt == "":
                    continue
                contents.append({"type": "text", "data": txt})
            elif item.tag == 'table':
                if winner_tag == 'td':
                    # Cell text is already collected by the text branch above.
                    continue
                if item != region:
                    # Keep the table markup but unwrap links inside it.
                    for el in item.xpath(".//a"):
                        el.drop_tag()
                    table_s = lxml.html.tostring(item)  # NOTE: bytes on Python 3
                    contents.append({"type": "html", "data": table_s})
                else:
                    for sub_item in item.xpath("//td/text()"):
                        contents.append({"type": "text", "data": sub_item})
            elif item.tag == 'img':
                # Lazy-load attributes take precedence over the plain src.
                src = None
                for img_prop in ('original', 'file', 'data-original', 'src-info', 'data-src', 'src'):
                    src = item.get(img_prop)
                    if src is not None:
                        break
                if src is None:
                    # BUGFIX: an <img> with none of the source attributes left
                    # src unbound-to-None and crashed on src.startswith below.
                    continue
                if self.url != "":
                    if not src.startswith("/") and not src.startswith("http") and not src.startswith("./"):
                        src = "/" + src
                    src = urllib.parse.urljoin(self.url, src, False)
                contents.append({"type": "image", "data": {"src": src}})
        return contents

    def extract_title(self):
        """Guess the article title.

        Prefer a heading (h1/h2/h3 or <p class="title">) whose text matches
        the first '_'/'-'-separated segment of the document <title>;
        otherwise fall back to the longest candidate.
        """
        doc = self.doc
        tag_title = doc.xpath("/html/head/title/text()")
        # Site titles are often "Article - SiteName"; keep only the first part.
        s_tag_title = "".join(re.split(r'_|-', "".join(tag_title))[:1])
        title_candidates = doc.xpath('//h1/text()|//h2/text()|//h3/text()|//p[@class="title"]/text()')
        for c_title in title_candidates:
            c_title = c_title.strip()
            if c_title != "" and (s_tag_title.startswith(c_title) or s_tag_title.endswith(c_title)):
                return c_title
        # Fallback: longest candidate, ties broken lexicographically
        # (equivalent to sorting (-len, text) tuples and taking the first).
        return min([s_tag_title] + title_candidates,
                   key=lambda x: (-len(x.strip()), x))

    def extract(self):
        """Extract everything; returns ``{"title": str, "content": list}``."""
        title = self.extract_title()
        region = self.region.locate()
        if region is None:
            return {'title': '', 'content': []}
        # Drop link-heavy paragraphs/list items (navigation, "related" boxes).
        for p_el in region.xpath(".//p|.//li"):
            child_links = p_el.xpath(".//a/text()")
            count_p = len(" ".join(p_el.xpath(".//text()")))
            count_a = len(" ".join(child_links))
            if float(count_a) / (count_p + 1.0) > self.anchor_ratio_limit:
                p_el.drop_tree()
        # Unwrap inline link/emphasis tags so their text joins the flow.
        # BUGFIX: was ".//strong|//b" -- the '//b' half searched the whole
        # document instead of the located region.
        rm_tag_set = set()
        for el in region.xpath(".//a"):
            rm_tag_set.add(el)
        for el in region.xpath(".//strong|.//b"):
            rm_tag_set.add(el)
        for el in rm_tag_set:
            el.drop_tag()
        content = self.extract_content(region)
        return {"title": title, "content": content}
`
model.py 修改为
`
#!/bin/env python
#encoding=utf-8
import re
import lxml
import lxml.html
import urllib
from .tags_util import clean_tags_only, clean_tags_hasprop, clean_tags_exactly, clean_tags
from .region import Region
class PageModel(object):
    """Readability-style extractor: pulls the title, main text content and
    images out of an HTML page string."""

    def __init__(self, page, url=""):
        """Clean *page* (an HTML string), parse it, and locate the content region.

        page -- raw HTML text (must be ``str``).
        url  -- base URL of the page; when non-empty it is used to
                absolutize relative image links.
        """
        # isinstance also accepts str subclasses, unlike ``type(page) is str``.
        assert isinstance(page, str)
        # Strip tags that never contribute readable content.
        for tag in ['style', 'script']:
            page = clean_tags(page, tag)
        page = clean_tags_hasprop(page, "div", "(display:.?none|comment|measure)")
        page = clean_tags_only(page, "(span|section|font|em)")
        self.doc = lxml.html.fromstring(page)
        self.url = url
        self.region = Region(self.doc)
        # Text shorter than this (whitespace removed) outside the winning
        # tag is treated as noise and dropped.
        self.impurity_threshold = 30
        # <p>/<li> elements whose anchor-text ratio exceeds this are dropped.
        self.anchor_ratio_limit = 0.3
        self.stripper = re.compile(r'\s+')

    def extract_content(self, region):
        """Walk *region* and return a list of ``{"type": ..., "data": ...}``
        dicts describing text nodes, images and tables, in document order."""
        # Local import: the file level only does ``import urllib``, which
        # does not bind the ``urllib.parse`` submodule.
        import urllib.parse
        items = region.xpath('.//text()|.//img|./table')
        # Histogram of text length per parent tag; the tag that wraps the
        # most text "wins" and is assumed to hold the main content.
        tag_hist = {}
        for item in items:
            if hasattr(item, 'tag'):
                # An element (img/table), not a text node -- no histogram entry.
                continue
            t = item.getparent().tag
            # BUGFIX: comment/processing-instruction nodes report a callable
            # ``tag`` rather than a str; skip them so the aggregation below
            # does not key on a non-string tag.
            if not isinstance(t, str):
                continue
            tag_hist[t] = tag_hist.get(t, 0) + len(item.strip())
        winner_tag = None
        if tag_hist:
            # Highest text volume wins; ties break on tag name, matching the
            # original max-over-(count, tag) tuple ordering.
            winner_tag = max(tag_hist.items(), key=lambda kv: (kv[1], kv[0]))[0]
        contents = []
        for item in items:
            if not hasattr(item, 'tag'):
                txt = item.strip()
                parent_tag = item.getparent().tag
                # Short stray text outside the winning tag (and not a list
                # item) is treated as noise.
                if parent_tag != winner_tag \
                        and len(self.stripper.sub("", txt)) < self.impurity_threshold \
                        and parent_tag != 'li':
                    continue
                if txt == "":
                    # Skip whitespace-only text nodes.
                    continue
                contents.append({"type": "text", "data": txt})
            elif item.tag == 'table':
                if winner_tag == 'td':
                    # Cell text is already collected by the text branch above.
                    continue
                if item != region:
                    # Keep the table markup but unwrap links inside it.
                    for el in item.xpath(".//a"):
                        el.drop_tag()
                    table_s = lxml.html.tostring(item)  # NOTE: bytes on Python 3
                    contents.append({"type": "html", "data": table_s})
                else:
                    for sub_item in item.xpath("//td/text()"):
                        contents.append({"type": "text", "data": sub_item})
            elif item.tag == 'img':
                # Lazy-load attributes take precedence over the plain src.
                src = None
                for img_prop in ('original', 'file', 'data-original', 'src-info', 'data-src', 'src'):
                    src = item.get(img_prop)
                    if src is not None:
                        break
                if src is None:
                    # BUGFIX: an <img> with none of the source attributes left
                    # src as None and crashed on src.startswith below.
                    continue
                if self.url != "":
                    if not src.startswith("/") and not src.startswith("http") and not src.startswith("./"):
                        src = "/" + src
                    # BUGFIX: was ``urlparse.urljoin`` -- a NameError, since
                    # no ``urlparse`` name is ever imported.
                    src = urllib.parse.urljoin(self.url, src, False)
                contents.append({"type": "image", "data": {"src": src}})
        return contents

    def extract_title(self):
        """Guess the article title.

        Prefer a heading (h1/h2/h3 or <p class="title">) whose text matches
        the first '_'/'-'-separated segment of the document <title>;
        otherwise fall back to the longest candidate.
        """
        doc = self.doc
        tag_title = doc.xpath("/html/head/title/text()")
        # Site titles are often "Article - SiteName"; keep only the first part.
        s_tag_title = "".join(re.split(r'_|-', "".join(tag_title))[:1])
        title_candidates = doc.xpath('//h1/text()|//h2/text()|//h3/text()|//p[@class="title"]/text()')
        for c_title in title_candidates:
            c_title = c_title.strip()
            if c_title != "" and (s_tag_title.startswith(c_title) or s_tag_title.endswith(c_title)):
                return c_title
        # Fallback: longest candidate, ties broken lexicographically
        # (equivalent to sorting (-len, text) tuples and taking the first).
        return min([s_tag_title] + title_candidates,
                   key=lambda x: (-len(x.strip()), x))

    def extract(self):
        """Extract everything; returns ``{"title": str, "content": list}``."""
        title = self.extract_title()
        region = self.region.locate()
        if region is None:
            return {'title': '', 'content': []}
        # Drop link-heavy paragraphs/list items (navigation, "related" boxes).
        for p_el in region.xpath(".//p|.//li"):
            child_links = p_el.xpath(".//a/text()")
            count_p = len(" ".join(p_el.xpath(".//text()")))
            count_a = len(" ".join(child_links))
            if float(count_a) / (count_p + 1.0) > self.anchor_ratio_limit:
                p_el.drop_tree()
        # Unwrap inline link/emphasis tags so their text joins the flow.
        # BUGFIX: was ".//strong|//b" -- the '//b' half searched the whole
        # document instead of the located region.
        rm_tag_set = set()
        for el in region.xpath(".//a"):
            rm_tag_set.add(el)
        for el in region.xpath(".//strong|.//b"):
            rm_tag_set.add(el)
        for el in rm_tag_set:
            el.drop_tag()
        content = self.extract_content(region)
        return {"title": title, "content": content}
`
你好,我最近也在做相关工作,一般的网页正文都是有很多多余的噪声数据,需要去除,这块有考虑后面加吗
A declarative, efficient, and flexible JavaScript library for building user interfaces.
🖖 Vue.js is a progressive, incrementally-adoptable JavaScript framework for building UI on the web.
TypeScript is a superset of JavaScript that compiles to clean JavaScript output.
An Open Source Machine Learning Framework for Everyone
The Web framework for perfectionists with deadlines.
A PHP framework for web artisans
Bring data to life with SVG, Canvas and HTML. 📊📈🎉
JavaScript (JS) is a lightweight interpreted programming language with first-class functions.
Some thing interesting about web. New door for the world.
A server is a program made to process requests and deliver data to clients.
Machine learning is a way of modeling and interpreting data that allows a piece of software to respond intelligently.
Some thing interesting about visualization, use data art
Some thing interesting about game, make everyone happy.
We are working to build community through open source technology. NB: members must have two-factor auth.
Open source projects and samples from Microsoft.
Google ❤️ Open Source for everyone.
Alibaba Open Source for everyone
Data-Driven Documents codes.
China tencent open source team.