最近在学习react-native和QT

利用pyspider抓取金十新闻资讯

python jackton 466℃ 0评论

因为之前用 golang 语言编写的爬虫只能爬取静态页面,而最近金十资讯网站改版了,所有的数据都改为 js 动态渲染,所以只能用 pyspider + phantomjs 去抓取。

一、安装 python 2.7 版本、pyspider 0.4.0 版本和 phantomjs

安装传送门:http://cuiqingcai.com/2443.html

二、编写代码

 

#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2017-02-28 15:51:27
# Project: jinshi pyspider-0.4.0

from pyspider.libs.base_handler import *
import MySQLdb
class Handler(BaseHandler):
    """pyspider handler that crawls JS-rendered news pages from
    news.jin10.com (via phantomjs, ``fetch_type='js'``) and stores each
    article through a ``MySQLDB`` helper defined later in this file.
    """

    crawl_config = {
    }

    def __init__(self):
        # One shared DB connection for the whole handler.
        self.mysqldb = MySQLDB()

    @every(minutes=24 * 60)  # re-seed the category list pages once a day
    def on_start(self):
        # Category id -> Chinese display name; both are carried through
        # ``save`` so detail_page can store them with the article.
        categories = [
            ('1', '国际'),
            ('4', '原油'),
            ('5', '贵金属'),
            ('6', '央行'),
            ('7', '外汇'),
            ('13', '独家'),
        ]
        for type_str, name in categories:
            self.crawl('https://news.jin10.com/list.html?cate=' + type_str,
                       callback=self.index_page,
                       save={'type_str': type_str, 'name': name},
                       fetch_type='js',
                       validate_cert=False)

    def index_page(self, response):
        """Parse a category list page: collect thumbnail URLs and article
        links, then schedule one detail crawl per article."""
        type_str = response.save['type_str']
        name = response.save['name']
        imglist = [img.attr.src
                   for img in response.doc('.jin-newsList__item img').items()]
        linklist = [a.attr.href
                    for a in response.doc('.jin-newsList__item a').items()]
        # zip pairs each link with its thumbnail; unlike the old
        # ``imglist[index]`` lookup it cannot raise IndexError when the
        # page has more links than images.
        for href, img_url in zip(linklist, imglist):
            self.crawl(href,
                       callback=self.detail_page,
                       save={'img_url': img_url,
                             'type_str': type_str,
                             'name': name},
                       fetch_type='js',
                       validate_cert=False)

    def detail_page(self, response):
        """Extract one article's fields and insert them into MySQL."""
        # Article id is the value of the last '='-separated URL component.
        articid = response.url.split('=')[-1]
        imgurl = response.save['img_url']
        type_str = response.save['type_str']
        name = response.save['name']
        title = response.doc('.jin-newsList__title').text()
        desc = response.doc('.jin-news-article_description').text()
        content = response.doc('.jin-news-article_content').text()
        contenthtml = response.doc('.jin-news-article_content').html()

        # The meta bar holds several <span>s; by observed page layout the
        # 3rd (index 2) is the publish time and the 4th (index 3) the
        # author.  Defaults stay empty strings if the spans are missing.
        author = ''
        pub_time = ''
        for i, item in enumerate(
                response.doc('.jin-news-article_h .jin-meta span').items()):
            if i == 2:
                pub_time = item.text()
            elif i == 3:
                author = item.text()

        items = {
            'type': type_str,
            'name': name,
            'articid': int(articid),
            'title': title,
            'abstract': desc,
            'content': content,
            'contenthtml': contenthtml,
            'time': pub_time,
            # NOTE: 'autor' (sic) matches the existing DB column name.
            'autor': author,
            'imgurl': imgurl,
            'detailurl': response.url,
        }
        self.mysqldb.insert(items)

class MySQLDB:
    """Minimal MySQLdb wrapper: one connection, one cursor, and an
    ``insert`` that writes a crawled article into ``jinshinews``."""

    def __init__(self):
        print("init db")
        # NOTE(review): credentials are hard-coded; move them to a config
        # file or environment variables before deploying.
        self.conn = MySQLdb.connect(user="root", passwd="123456", db="news",
                                    host="127.0.0.1", charset='utf8',
                                    use_unicode=True)
        self.cursor = self.conn.cursor()

    def insert(self, item):
        """Insert one article dict.

        ``INSERT IGNORE`` silently skips rows whose unique key (the
        article id) already exists, so re-crawls do not create
        duplicates.  Values are passed as parameters, never interpolated
        into the SQL string.
        """
        try:
            self.cursor.execute(
                """INSERT IGNORE INTO jinshinews
                   (type, name, articid, title, abstract, content,
                    contenthtml, time, autor, imgurl, detailurl)
                   VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)""",
                (
                    item['type'],
                    item['name'],
                    item['articid'],
                    item['title'],
                    item['abstract'],
                    item['content'],
                    item['contenthtml'],
                    item['time'],
                    item['autor'],
                    item['imgurl'],
                    item['detailurl'],
                )
            )
            self.conn.commit()
        except MySQLdb.Error as e:
            # Roll back so the connection is not stuck in a failed
            # transaction; log the MySQL error code and message.
            self.conn.rollback()
            print('Error db %d %s' % (e.args[0], e.args[1]))

 

转载请注明:Coding » 利用pyspider抓取金十新闻资讯

喜欢 (0)

您必须 登录 才能发表评论!