Giter Site home page Giter Site logo

jparser's People

Contributors

fxsjy avatar qjfoidnh avatar

Stargazers

 avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar

Watchers

 avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar

jparser's Issues

在 Python3 中计算每种 tag 字数的时候,会发生 cyfunction 不能和 str 比较的 TypeError

需要加一个判断
在model.py文件中需要加入:
if not isinstance(t, str):
continue

```python

import re
import lxml
import lxml.html
import urllib.parse

from .tags_util import clean_tags_only, clean_tags_hasprop, clean_tags_exactly, clean_tags
from .region import Region

class PageModel(object):
    """Extract the main content (title, text, images, tables) from an HTML page."""

    def __init__(self, page, url=""):
        """Build the model from raw HTML.

        page: HTML source as a decoded str.
        url:  optional base URL, used to absolutize relative image sources.
        """
        assert isinstance(page, str)
        # Strip tags that never carry article text.
        for tag in ['style', 'script']:
            page = clean_tags(page, tag)
        # Drop hidden/comment/measurement divs and unwrap inline wrappers.
        page = clean_tags_hasprop(page, "div", "(display:.?none|comment|measure)")
        page = clean_tags_only(page, "(span|section|font|em)")
        self.doc = lxml.html.fromstring(page)
        self.url = url
        self.region = Region(self.doc)
        # Text shorter than this (whitespace removed) outside the winner tag is noise.
        self.impurity_threshold = 30
        # Elements whose anchor-text ratio exceeds this are link lists, not content.
        self.anchor_ratio_limit = 0.3
        self.stripper = re.compile(r'\s+')

    def extract_content(self, region):
        """Collect text, image and table items from *region*.

        Returns a list of dicts: {"type": "text"|"html"|"image", "data": ...}.
        """
        items = region.xpath('.//text()|.//img|./table')
        # Histogram of character count per parent tag; the tag holding the most
        # text "wins" and defines what counts as main content.
        tag_hist = {}
        for item in items:
            if hasattr(item, 'tag'):
                continue  # img/table elements don't contribute to the histogram
            t = item.getparent().tag
            # lxml comments/PIs have a callable .tag (not str); skip them to
            # avoid the Python 3 "cyfunction vs str" comparison TypeError.
            if not isinstance(t, str):
                continue
            if t not in tag_hist:
                tag_hist[t] = 0
            tag_hist[t] += len(item.strip())
        winner_tag = None
        if len(tag_hist) > 0:
            winner_tag = max((c, k) for k, c in tag_hist.items())[1]
        contents = []
        for item in items:
            if not hasattr(item, 'tag'):
                txt = item.strip()
                parent_tag = item.getparent().tag
                # Short stray text outside the winner tag is treated as noise
                # (list items are always kept).
                if parent_tag != winner_tag \
                        and len(self.stripper.sub("", txt)) < self.impurity_threshold \
                        and parent_tag != 'li':
                    continue
                if txt == "":
                    continue
                contents.append({"type": "text", "data": txt})
            elif item.tag == 'table':
                if winner_tag == 'td':
                    continue  # main text lives inside cells; emitted as text above
                if item != region:
                    for el in item.xpath(".//a"):
                        el.drop_tag()
                    # encoding="unicode" makes tostring return str; the default
                    # returns bytes on Python 3, mixing types in the output.
                    table_s = lxml.html.tostring(item, encoding="unicode")
                    contents.append({"type": "html", "data": table_s})
                else:
                    for sub_item in item.xpath("//td/text()"):
                        contents.append({"type": "text", "data": sub_item})
            elif item.tag == 'img':
                src = None
                # Lazy-load attributes take priority; plain src is the fallback.
                for img_prop in ('original', 'file', 'data-original', 'src-info', 'data-src', 'src'):
                    src = item.get(img_prop)
                    if src is not None:
                        break
                if src is None:
                    # BUGFIX: an <img> with none of the known attributes used to
                    # crash on src.startswith(...) below when self.url was set.
                    continue
                if self.url != "":
                    if not src.startswith(("/", "http", "./")):
                        src = "/" + src
                    src = urllib.parse.urljoin(self.url, src, False)
                contents.append({"type": "image", "data": {"src": src}})
        return contents

    def extract_title(self):
        """Pick the article title: a heading matching <title>, else the longest candidate."""
        doc = self.doc
        tag_title = doc.xpath("/html/head/title/text()")
        # Site names are usually appended after '_' or '-'; keep the first segment.
        s_tag_title = "".join(re.split(r'_|-', "".join(tag_title))[:1])
        title_candidates = doc.xpath('//h1/text()|//h2/text()|//h3/text()|//p[@class="title"]/text()')
        for c_title in title_candidates:
            c_title = c_title.strip()
            if c_title != "" and (s_tag_title.startswith(c_title) or s_tag_title.endswith(c_title)):
                return c_title
        # Fall back to the longest string among <title> and the heading candidates.
        sort_by_len_list = sorted((-1 * len(x.strip()), x) for x in ([s_tag_title] + title_candidates))
        return sort_by_len_list[0][1]

    def extract(self):
        """Return {'title': str, 'content': [items]} for the located main region."""
        title = self.extract_title()
        region = self.region.locate()
        if region is None:
            return {'title': '', 'content': []}
        # Drop link-heavy paragraphs/list items (navigation, related-article lists).
        for p_el in region.xpath(".//p|.//li"):
            child_links = p_el.xpath(".//a/text()")
            count_p = len(" ".join(p_el.xpath(".//text()")))
            count_a = len(" ".join(child_links))
            if float(count_a) / (count_p + 1.0) > self.anchor_ratio_limit:
                p_el.drop_tree()
        # Unwrap inline markup so its text merges with the surrounding content.
        # BUGFIX: './/b' — '//b' previously matched the whole document, not the region.
        rm_tag_set = set()
        for el in region.xpath(".//a|.//strong|.//b"):
            rm_tag_set.add(el)
        for el in rm_tag_set:
            el.drop_tag()
        content = self.extract_content(region)
        return {"title": title, "content": content}

```

库不支持python3

model.py 修改为

```python
#!/bin/env python
#encoding=utf-8
import re
import urllib
import urllib.parse

import lxml
import lxml.html

from .tags_util import clean_tags_only, clean_tags_hasprop, clean_tags_exactly, clean_tags
from .region import Region

class PageModel(object):
    """Extract the main content (title, text, images, tables) from an HTML page."""

    def __init__(self, page, url=""):
        """Build the model from raw HTML.

        page: HTML source as a decoded str.
        url:  optional base URL, used to absolutize relative image sources.
        """
        assert isinstance(page, str)
        # Strip tags that never carry article text.
        for tag in ['style', 'script']:
            page = clean_tags(page, tag)
        # Drop hidden/comment/measurement divs and unwrap inline wrappers.
        page = clean_tags_hasprop(page, "div", "(display:.?none|comment|measure)")
        page = clean_tags_only(page, "(span|section|font|em)")
        self.doc = lxml.html.fromstring(page)
        self.url = url
        self.region = Region(self.doc)
        # Text shorter than this (whitespace removed) outside the winner tag is noise.
        self.impurity_threshold = 30
        # Elements whose anchor-text ratio exceeds this are link lists, not content.
        self.anchor_ratio_limit = 0.3
        self.stripper = re.compile(r'\s+')

    def extract_content(self, region):
        """Collect text, image and table items from *region*.

        Returns a list of dicts: {"type": "text"|"html"|"image", "data": ...}.
        """
        items = region.xpath('.//text()|.//img|./table')
        # Histogram of character count per parent tag; the tag holding the most
        # text "wins" and defines what counts as main content.
        tag_hist = {}
        for item in items:
            if hasattr(item, 'tag'):
                continue  # img/table elements don't contribute to the histogram
            t = item.getparent().tag
            # BUGFIX (Python 3): lxml comments/PIs have a callable .tag, which
            # cannot be compared with str keys — skip them.
            if not isinstance(t, str):
                continue
            if t not in tag_hist:
                tag_hist[t] = 0
            tag_hist[t] += len(item.strip())
        winner_tag = None
        if len(tag_hist) > 0:
            winner_tag = max((c, k) for k, c in tag_hist.items())[1]
        contents = []
        for item in items:
            if not hasattr(item, 'tag'):
                txt = item.strip()
                parent_tag = item.getparent().tag
                # Short stray text outside the winner tag is treated as noise
                # (list items are always kept).
                if parent_tag != winner_tag \
                        and len(self.stripper.sub("", txt)) < self.impurity_threshold \
                        and parent_tag != 'li':
                    continue
                if txt == "":
                    continue
                contents.append({"type": "text", "data": txt})
            elif item.tag == 'table':
                if winner_tag == 'td':
                    continue  # main text lives inside cells; emitted as text above
                if item != region:
                    for el in item.xpath(".//a"):
                        el.drop_tag()
                    # encoding="unicode" makes tostring return str; the default
                    # returns bytes on Python 3, mixing types in the output.
                    table_s = lxml.html.tostring(item, encoding="unicode")
                    contents.append({"type": "html", "data": table_s})
                else:
                    for sub_item in item.xpath("//td/text()"):
                        contents.append({"type": "text", "data": sub_item})
            elif item.tag == 'img':
                src = None
                # Lazy-load attributes take priority; plain src is the fallback.
                for img_prop in ('original', 'file', 'data-original', 'src-info', 'data-src', 'src'):
                    src = item.get(img_prop)
                    if src is not None:
                        break
                if src is None:
                    # BUGFIX: an <img> with none of the known attributes used to
                    # crash on src.startswith(...) below when self.url was set.
                    continue
                if self.url != "":
                    if not src.startswith(("/", "http", "./")):
                        src = "/" + src
                    # BUGFIX (Python 3): was urlparse.urljoin — a NameError,
                    # since only urllib was imported.
                    src = urllib.parse.urljoin(self.url, src, False)
                contents.append({"type": "image", "data": {"src": src}})
        return contents

    def extract_title(self):
        """Pick the article title: a heading matching <title>, else the longest candidate."""
        doc = self.doc
        tag_title = doc.xpath("/html/head/title/text()")
        # Site names are usually appended after '_' or '-'; keep the first segment.
        s_tag_title = "".join(re.split(r'_|-', "".join(tag_title))[:1])
        title_candidates = doc.xpath('//h1/text()|//h2/text()|//h3/text()|//p[@class="title"]/text()')
        for c_title in title_candidates:
            c_title = c_title.strip()
            if c_title != "" and (s_tag_title.startswith(c_title) or s_tag_title.endswith(c_title)):
                return c_title
        # Fall back to the longest string among <title> and the heading candidates.
        sort_by_len_list = sorted((-1 * len(x.strip()), x) for x in ([s_tag_title] + title_candidates))
        return sort_by_len_list[0][1]

    def extract(self):
        """Return {'title': str, 'content': [items]} for the located main region."""
        title = self.extract_title()
        region = self.region.locate()
        if region is None:
            return {'title': '', 'content': []}
        # Drop link-heavy paragraphs/list items (navigation, related-article lists).
        for p_el in region.xpath(".//p|.//li"):
            child_links = p_el.xpath(".//a/text()")
            count_p = len(" ".join(p_el.xpath(".//text()")))
            count_a = len(" ".join(child_links))
            if float(count_a) / (count_p + 1.0) > self.anchor_ratio_limit:
                p_el.drop_tree()
        # Unwrap inline markup so its text merges with the surrounding content.
        # BUGFIX: './/b' — '//b' previously matched the whole document, not the region.
        rm_tag_set = set()
        for el in region.xpath(".//a|.//strong|.//b"):
            rm_tag_set.add(el)
        for el in rm_tag_set:
            el.drop_tag()
        content = self.extract_content(region)
        return {"title": title, "content": content}

```

网页正文去除噪声数据

你好,我最近也在做相关工作,一般的网页正文都是有很多多余的噪声数据,需要去除,这块有考虑后面加吗

Recommend Projects

  • React photo React

    A declarative, efficient, and flexible JavaScript library for building user interfaces.

  • Vue.js photo Vue.js

    🖖 Vue.js is a progressive, incrementally-adoptable JavaScript framework for building UI on the web.

  • Typescript photo Typescript

    TypeScript is a superset of JavaScript that compiles to clean JavaScript output.

  • TensorFlow photo TensorFlow

    An Open Source Machine Learning Framework for Everyone

  • Django photo Django

    The Web framework for perfectionists with deadlines.

  • D3 photo D3

    Bring data to life with SVG, Canvas and HTML. 📊📈🎉

Recommend Topics

  • javascript

    JavaScript (JS) is a lightweight interpreted programming language with first-class functions.

  • web

    Some thing interesting about web. New door for the world.

  • server

    A server is a program made to process requests and deliver data to clients.

  • Machine learning

    Machine learning is a way of modeling and interpreting data that allows a piece of software to respond intelligently.

  • Game

    Some thing interesting about game, make everyone happy.

Recommend Org

  • Facebook photo Facebook

    We are working to build community through open source technology. NB: members must have two-factor auth.

  • Microsoft photo Microsoft

    Open source projects and samples from Microsoft.

  • Google photo Google

    Google ❤️ Open Source for everyone.

  • D3 photo D3

    Data-Driven Documents codes.