Giter Site home page Giter Site logo

Comments (5)

kof0012 avatar kof0012 commented on May 18, 2024
import hashlib
import json
import time

import pymysql
import requests
from fake_useragent import UserAgent
from requests.exceptions import RequestException
import trip


ua = UserAgent()
s = requests.session()


def getASCP():
    t = round(time.time())
    e = hex(t).upper()[2:]
    m = hashlib.md5()
    m.update(str(t).encode(encoding='utf-8'))
    i = m.hexdigest().upper()

    if len(e) != 8:
        AS = '479BB4B7254C150'
        CP = '7E0AC8874BB0985'
        return AS, CP

    n = i[0:5]
    a = i[-5:]
    s = ''
    r = ''
    for o in range(5):
        s += n[o] + e[o]
        r += e[o + 3] + a[o]

    AS = 'A1' + s + e[-3:]
    CP = e[0:3] + r + 'E1'
    return AS, CP

@trip.coroutine
def start_requests(maxtime=0):
    AS, CP = getASCP()
    headers = {'User-Agent': ua.random}
    feed_url = 'https://www.toutiao.com/api/pc/feed/'
    payloads = {'max_behot_time': maxtime, 'category': '__all__', 'utm_source': 'toutiao', 'widen': 1,
                'tadrequire': 'false', 'as': AS, 'cp': CP}
    global r
    try:
        r =yield trip.get(feed_url, params=payloads, headers=headers)

        r_co=r.content
        r.encoding="utf-8"
        print(r.encoding)
        r_js=r.json()
        if 'data' in r.keys():
            return r_js
    except RequestException as e:
        print('请求不成功', e)
        return None


async def parse_detail(response):
    for i in response.get('data', None):
        if i.get('is_feed_ad') == False:
            title = i.get('title')
            tags = i.get('chinese_tag')
            comments = i.get('comments_count')
            result = {'title': title, 'tags': tags, 'comments': comments}
            print(result)
            insert_mysql(result)
    if response.get('next'):
        maxtime = response.get('next').get('max_behot_time')

        return parse_detail(trip.run(start_requests(maxtime=maxtime)))


def write_json(result):
    with open('tt.txt', 'a', encoding='utf-8') as f:
        f.write(json.dumps(result, ensure_ascii=False) + '\n')


def insert_mysql(result):
    try:
        conn = pymysql.Connect(host="127.0.0.1", port=3306, user='root', passwd='root', db='spider', charset='utf8')
        cursor = conn.cursor()
        sql_in = "replace into lala (title,tags,comments) values(%s,%s,%s)"
        cursor.execute(sql_in, (result['title'], result['tags'], result['comments']))
        conn.commit()
    except Exception as  e:
        print(e)
        conn.rollback()


def main():
    response = trip.run(start_requests)

    trip.run(parse_detail(response))


if __name__ == '__main__':
    main()

from trip.

littlecodersh avatar littlecodersh commented on May 18, 2024

@kof0012 是我gzip处理的时候的问题,你更新一下版本(0.0.3)即可。

python -m pip install trip -U

from trip.

kof0012 avatar kof0012 commented on May 18, 2024

@littlecodersh 感谢回复,另外请问怎么在trip.run(fun)里写参数。实现trip.run(fun(args)),还是抓今日头条的json文件,想要递归回调(带参数),想了半天想不出来办法。。求教。

import trip
import hashlib
import json
import time
import pymysql
import requests
from fake_useragent import UserAgent
from requests.exceptions import RequestException

ua = UserAgent()

conn = pymysql.Connect(host="127.0.0.1", port=3306,
                       user='root', passwd='root', db='spider', charset='utf8')
cursor = conn.cursor()


def getASCP():
    t = round(time.time())
    e = hex(t).upper()[2:]
    m = hashlib.md5()
    m.update(str(t).encode(encoding='utf-8'))
    i = m.hexdigest().upper()

    if len(e) != 8:
        AS = '479BB4B7254C150'
        CP = '7E0AC8874BB0985'
        return AS, CP

    n = i[0:5]
    a = i[-5:]
    s = ''
    r = ''
    for o in range(5):
        s += n[o] + e[o]
        r += e[o + 3] + a[o]

    AS = 'A1' + s + e[-3:]
    CP = e[0:3] + r + 'E1'
    return AS, CP

@trip.coroutine
def start_requests(maxtime=0):
    global r
    AS, CP = getASCP()
    headers = {'User-Agent': ua.random}
    feed_url = 'https://www.toutiao.com/api/pc/feed/'
    payloads = {'max_behot_time': maxtime, 'category': '__all__', 'utm_source': 'toutiao', 'widen': 1,
                'tadrequire': 'false', 'as': AS, 'cp': CP}
    try:
        r =yield trip.get(feed_url, params=payloads, headers=headers)
        res=r.json()
    except RequestException as e:
        print('请求不成功', e)
        return None


def parse_detail(response):
    global ss
    for i in response.get('data', None):
        if i.get('is_feed_ad') == False:
            result = {'title': i.get('title'), 'tags': i.get('chinese_tag'), 'comments': i.get(
                'comments_count'), 'url': 'https://www.toutiao.com' + i.get('source_url')}
            print(result)
            insert_mysql(result)


    if response.get('next'):
        maxtime = response.get('next').get('max_behot_time')
        ss=start_requests(maxtime=maxtime)
        return parse_detail(ss.json())



def write_json(result):
    with open('tt.txt', 'a', encoding='utf-8') as f:
        f.write(json.dumps(result, ensure_ascii=False) + '\n')


def insert_mysql(result):
    try:

        sql_in = "insert into toutiaocomment(title,tags,comments,url) VALUES(%s,%s,%s,%s) ON DUPLICATE KEY UPDATE comments=VALUES(comments)"
        cursor.execute(
            sql_in, (result['title'], result['tags'], result['comments'], result['url']))
        conn.commit()
    except Exception as e:
        print(e)
        conn.rollback()

def main():
    trip.run(start_requests)
    parse_detail(r.json())



if __name__ == '__main__':
    main()

from trip.

littlecodersh avatar littlecodersh commented on May 18, 2024

@kof0012

from functools import partial

from trip.

kof0012 avatar kof0012 commented on May 18, 2024

@littlecodersh 多谢提醒,已经解决。

from trip.

Related Issues (17)

Recommend Projects

  • React photo React

    A declarative, efficient, and flexible JavaScript library for building user interfaces.

  • Vue.js photo Vue.js

    🖖 Vue.js is a progressive, incrementally-adoptable JavaScript framework for building UI on the web.

  • Typescript photo Typescript

    TypeScript is a superset of JavaScript that compiles to clean JavaScript output.

  • TensorFlow photo TensorFlow

    An Open Source Machine Learning Framework for Everyone

  • Django photo Django

    The Web framework for perfectionists with deadlines.

  • D3 photo D3

    Bring data to life with SVG, Canvas and HTML. 📊📈🎉

Recommend Topics

  • javascript

    JavaScript (JS) is a lightweight interpreted programming language with first-class functions.

  • web

    Some thing interesting about web. New door for the world.

  • server

    A server is a program made to process requests and deliver data to clients.

  • Machine learning

    Machine learning is a way of modeling and interpreting data that allows a piece of software to respond intelligently.

  • Game

    Some thing interesting about game, make everyone happy.

Recommend Org

  • Facebook photo Facebook

    We are working to build community through open source technology. NB: members must have two-factor auth.

  • Microsoft photo Microsoft

    Open source projects and samples from Microsoft.

  • Google photo Google

    Google ❤️ Open Source for everyone.

  • D3 photo D3

    Data-Driven Documents codes.