Giter Site home page Giter Site logo

dementor's People

Contributors

kaiserkatze avatar

Watchers

 avatar

dementor's Issues

Add support for scraping data from Bank of China

def send_request(method='POST'):
    """
    Describe the request sent to the target URL.

    Method: POST
    Form data:
        randCode: e172ev8m    # anti-scraping random token
        scount: 5
        cPage: 6              # page number
        articleField01:       # commodity code (query filter)
        articleField02:       # commodity name (query filter)

    NOTE: this is still a stub. It only binds the endpoint address to a
    local; ``method`` is not used and nothing is sent, so the function
    returns None.
    """

    endpoint = "http://hd.chinatax.gov.cn/fagui/action/InitChukou.do"

def get_hidden_input_value(input_name: str):
    """Return the ``value`` attribute of a hidden ``<input>`` field.

    The field is looked up by name in the module-level ``soup`` document,
    which is defined outside this function (presumably a BeautifulSoup
    object -- confirm against the module setup).

    Args:
        input_name: The ``name`` attribute of the hidden input to read.

    Returns:
        The result of ``node.get('value')`` for the matched input, or None
        when no matching hidden input exists in the document.
    """

    # Select the hidden input with the requested name.
    node = soup.select_one(f'input[type="hidden"][name="{input_name}"]')
    if node is None:
        # No such field: report "no value" instead of failing on None.get.
        return None
    # Hand the value back; callers assign the result of this function.
    return node.get('value')

def generate_request():
    """Generate the request.

    Currently this only reads the hidden ``randCode`` and ``scount`` form
    fields; the values are not used yet and nothing is returned.
    """

    rand_code = get_hidden_input_value('randCode')
    s_count = get_hidden_input_value('scount')

def get_page_info():
    """Read the pagination summary from the search form.

    The summary is taken from the first child of the first table cell
    matched inside ``#searchForm .sv_hei`` in the module-level ``soup``
    document (defined outside this function), and is expected to look like
    ``共10729条 716页 5/716``.

    Returns:
        A ``(total_pages, current_page)`` tuple of digit strings, or None
        when the summary cell is missing, empty, or not in the expected
        shape.
    """

    # Select the parent cell that holds the pagination summary.
    cell = soup.select_one('#searchForm .sv_hei table:nth-of-type(1) td')
    if cell is None or not cell.contents:
        return None

    # TODO: assert that the first child is a text node.
    text = cell.contents[0].string
    if not text:
        # ``.string`` may be None; there is nothing to parse in that case.
        return None

    # TODO: assert the text looks like (共10729条 716页 5/716).
    # ``search`` rather than ``match``: the summary may be preceded by an
    # opening parenthesis or whitespace, which ``match`` would reject.
    match_result = re.search(r'共\d+条 (\d+)页 (\d+)/\d+', text)
    if match_result is None:
        return None

    # Return the tuple (total pages, current page).
    return match_result.group(1), match_result.group(2)

def parse_table():
    """Parse the result table.

    The table has 4 columns, named: commodity code, commodity name,
    unit of measure, and VAT refund rate (%).

    Each data row is turned into a ``pd.Series`` of its children's
    ``.string`` values. The series is not stored anywhere yet, so the
    function currently returns None.
    """

    # Select every row of the table, then skip the first one.
    all_rows = soup.select('#searchForm .sv_hei table:nth-of-type(2) tr')
    for tr in all_rows[1:]:
        # Text of every child of the row.
        # NOTE(review): ``.children`` may also yield whitespace text nodes
        # between cells, depending on the markup -- confirm whether only
        # the cells are wanted here.
        cell_texts = [child.string for child in tr.children]
        # Build a Series from the row.
        series = pd.Series(cell_texts)
        # TODO: insert into a DataFrame

Recommend Projects

  • React photo React

    A declarative, efficient, and flexible JavaScript library for building user interfaces.

  • Vue.js photo Vue.js

    🖖 Vue.js is a progressive, incrementally-adoptable JavaScript framework for building UI on the web.

  • Typescript photo Typescript

    TypeScript is a superset of JavaScript that compiles to clean JavaScript output.

  • TensorFlow photo TensorFlow

    An Open Source Machine Learning Framework for Everyone

  • Django photo Django

    The Web framework for perfectionists with deadlines.

  • D3 photo D3

    Bring data to life with SVG, Canvas and HTML. 📊📈🎉

Recommend Topics

  • javascript

    JavaScript (JS) is a lightweight interpreted programming language with first-class functions.

  • web

    Something interesting about the web. A new door for the world.

  • server

    A server is a program made to process requests and deliver data to clients.

  • Machine learning

    Machine learning is a way of modeling and interpreting data that allows a piece of software to respond intelligently.

  • Game

    Something interesting about games, making everyone happy.

Recommend Org

  • Facebook photo Facebook

    We are working to build community through open source technology. NB: members must have two-factor auth.

  • Microsoft photo Microsoft

    Open source projects and samples from Microsoft.

  • Google photo Google

    Google ❤️ Open Source for everyone.

  • D3 photo D3

    Data-Driven Documents codes.