# -*- coding:utf-8 -*-
"""
Caixin news search and download utilities
Created on 2017/06/09
@author: Yuan Yifan
@group : ~
@contact: tsingjyujing@163.com
"""

"""
# Test script
from caixinnews import *
urls = query_news(start_date='2017-05-09', end_date='2017-05-09')
title, text = read_page(urls[0])
print(title)
print(text)
"""

import re
import datetime
from bs4 import BeautifulSoup

try:
    from urllib.request import urlopen, Request
except ImportError:
    from urllib2 import urlopen, Request

caixin_search_url = "http://search.caixin.com/search/search.jsp?startDate=%s&endDate=%s&keyword=%s&x=0&y=0"

default_parser = "html.parser"
# default_parser = "lxml"

UA = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " +\
     "AppleWebKit/537.36 (KHTML, like Gecko) " +\
     "Chrome/42.0.2311.135 " +\
     "Safari/537.36 " +\
     "Edge/12.10240"

req_header = {
    'User-Agent': UA,
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
}

req_timeout = 10


def read_url(url):
    """
    Read the content of a URL, imitating a browser request.

    Parameters
    ------
        url: string
            the URL to fetch
    Return
    ------
        string
            the raw response body
    """
    # Copy the default headers so the module-level dict is not mutated,
    # then set the Host header extracted from the URL being requested.
    req_header_this = dict(req_header)
    req_header_this['Host'] = re.findall('://.*?/', url, re.DOTALL)[0][3:-1]
    return urlopen(Request(url, None, req_header_this), None, req_timeout).read()


def get_soup(url):
    """
    Read the content of a URL and parse it into a BeautifulSoup object.

    Parameters
    ------
        url: string
            the URL to fetch
    Return
    ------
        BeautifulSoup
            the parsed document
    """
    return BeautifulSoup(read_url(url), default_parser)


def query_news(keywords='*', start_date=None, end_date=None):
    """
    Query the list of news article URLs for a given date range.

    Parameters
    ------
        keywords: string
            search keywords
        start_date: string
            start date, formatted as yyyy-mm-dd
        end_date: string
            end date, formatted as yyyy-mm-dd
    Return
    ------
        list
            the article URLs found
    """
    # Default to yesterday when no date range is given.
    if start_date is None or end_date is None:
        now_time = datetime.datetime.now()
        last_day = datetime.datetime(now_time.year, now_time.month, now_time.day, 12, 0) - \
            datetime.timedelta(seconds=3600 * 24)
        start_date = last_day.strftime("%Y-%m-%d")
        end_date = start_date
    url = caixin_search_url % (start_date, end_date, keywords)
    soup = get_soup(url)
    info_urls = []
    while True:
        # Collect the article links on the current page and find the "next page" button.
        next_info = soup.find_all(name='a', attrs={'class': 'pageNavBtn2'})[0]
        all_res = soup.find_all(name='div', attrs={'class': 'searchxt'})
        for res in all_res:
            info_urls.append(res.a.attrs['href'])
        next_info = next_info.attrs['href']
        if next_info == "javascript:void();":
            # No further pages.
            break
        else:
            soup = get_soup(caixin_search_url + next_info)
    return info_urls


def is_blog(url):
    """
    Check whether a URL points to a blog post.

    Parameters
    ------
        url: string
            the URL to check
    Return
    ------
        bool
            True if the URL is a blog URL
    """
    return len(re.findall(r'blog\.caixin\.com', url)) > 0


def read_page(url):
    """
    Read the title and body text of an article.

    Parameters
    ------
        url: string
            the URL to read
    Return
    ------
        title: string
            the article title
        text: string
            the article body
    """
    if is_blog(url):
        return read_blog(url)
    else:
        return read_normal_artical(url)


def read_normal_artical(url):
    # Regular news article: take the page title and concatenate all <p> text.
    soup = get_soup(url)
    title = soup.title.get_text()
    ps = soup.find_all('p')
    text = ''
    for p in ps:
        text += p.get_text() + "\n"
    return title, text


def read_blog(url):
    # Blog post: the body lives inside the div with class "blog_content".
    soup = get_soup(url)
    title = soup.title.get_text()
    bcontent = soup.find_all(name='div', attrs={'class': 'blog_content'})
    ps = bcontent[0].find_all('p')
    text = ''
    for p in ps:
        text += p.get_text() + "\n"
    return title, text
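

# Example usage, mirroring the test script in the module docstring.
# This is a minimal sketch: the date below is a placeholder, and it assumes the
# Caixin search page still uses the same markup that query_news() expects.
if __name__ == "__main__":
    urls = query_news(start_date='2017-05-09', end_date='2017-05-09')
    if urls:
        title, text = read_page(urls[0])
        print(title)
        print(text)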