• "date": "2020-03-30"
  • "title": "入门级网页爬虫"

环境:PyCharm 2019.3 + Anaconda3 2020.02

代码

import urllib.request
import time
import re
import os


def save(page_id, title, en_title, college, info, source):
    """Append one record to ``2017210213.txt``.

    The record consists of six ``key:value`` lines (id, title, en_title,
    college, info, source) followed by one blank line.

    Every text field is checked for GBK encodability; a field that cannot be
    encoded is replaced by the placeholder ``'gbk无法编码'``. Previously only
    ``info`` was guarded, so a single non-GBK character in any other field
    raised ``UnicodeEncodeError`` and stopped the whole crawl.

    Args:
        page_id: Identifier of the crawled page.
        title: Program title.
        en_title: English program title.
        college: College name.
        info: Program description.
        source: Program source text.
    """
    def gbk_safe(value):
        # Round-trip through GBK purely to detect characters it cannot hold.
        try:
            return value.encode('gbk').decode('gbk')
        except UnicodeEncodeError:
            return 'gbk无法编码'

    title, en_title, college, info, source = (
        gbk_safe(field) for field in (title, en_title, college, info, source)
    )
    key = 'id:{}\ntitle:{}\nen_title:{}\ncollege:{}\ninfo:{}\nsource:{}\n\n' \
        .format(page_id, title, en_title, college, info, source)
    # The file is opened with the platform default encoding, as before, so
    # code reading it back with a plain open() keeps working.
    with open('2017210213.txt', 'a') as f1:
        f1.write(key)
    print(page_id, 'save')


def get_page(page_id):
    """Download the program page for ``page_id`` and return it as text.

    Builds an opener with browser-like request headers, installs it as the
    global ``urllib.request`` opener, fetches
    ``https://win.bupt.edu.cn/program.do?id=<page_id>``, decodes the body as
    UTF-8 and then sleeps three seconds to throttle successive requests.

    Args:
        page_id: Page identifier appended to the URL.

    Returns:
        The response body decoded as UTF-8.
    """
    opener = urllib.request.build_opener()
    # OpenerDirector reads its default headers from ``addheaders``; the
    # previous ``add_headers`` assignment created an unused attribute, so
    # none of these headers were actually sent.
    opener.addheaders = [
        ("accept-encoding", "identity;q=1,*;q=0"),
        ("accept-language", "zh-CN,zh;q=0.9"),
        ('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.108 Safari/537.36'),
    ]
    urllib.request.install_opener(opener)
    url = 'https://win.bupt.edu.cn/program.do?id=' + str(page_id)
    # Close the response deterministically instead of leaking the socket.
    with urllib.request.urlopen(url) as response:
        text = response.read().decode('utf-8')
    # Pause between requests so the crawl does not hammer the server.
    time.sleep(3)
    print(page_id, 'got')
    return text


def find(page_id, text):
    """Extract the program fields from a page and store them via ``save``.

    The main pattern captures, in order, the title, the English title, the
    college and the description from the page HTML. A second pattern captures
    the text that precedes the escaped sequence
    ``\\u8bad\\u7ec3\\u8ba1\\u5212`` inside the ``var eval_score`` script
    block; its ``\\uXXXX`` escapes are decoded and used as the source.

    Args:
        page_id: Identifier of the page being parsed.
        text: HTML text of the page.

    Returns:
        ``page_id`` when the page does not contain the expected markup,
        otherwise ``None`` (the record is passed to ``save``).
    """
    reg = r'<h2 style="display:inline">(.*?)</h2>.*?' \
          r'<h3 style="display:inline;">&nbsp;(.*?)</h3>.*?' \
          r'<div style="font-size:17px;line-height:25px;">.*?' \
          r'<div style="margin-top:-7px;overflow: hidden;white-space: nowrap;text-overflow: ellipsis;">(.*?)</div>.*?' \
          r'<div style="font-size:17px;line-height:25px;">(.*?)</div>'
    fields = re.search(reg, text, re.S)
    if fields is None:
        print(page_id, 'not exist')
        return page_id

    title = fields.group(1).strip()
    en_title = fields.group(2).strip()
    college = fields.group(3).strip()
    # Flatten the description to one line so each record keeps a fixed
    # number of lines in the output file.
    info = fields.group(4).replace('\n', ' ').replace('\r', ' ').strip()

    source_match = re.search(
        r'var eval_score.*?name":"(.*?)\\u8bad\\u7ec3\\u8ba1\\u5212',
        text, re.S)
    if source_match is None:
        # Previously this case raised IndexError on ``s[0]`` and aborted the
        # crawl; store an empty source instead.
        print(page_id, 'source not found')
        source = ''
    else:
        # The captured text holds literal \uXXXX escapes; turn them into
        # the characters they denote.
        source = source_match.group(1).strip() \
            .encode('utf-8').decode('unicode_escape')
    save(page_id, title, en_title, college, info, source)


def _last_saved_id(path):
    """Return the id of the last record stored in ``path``, or ``None``.

    Scans the file backwards for the most recent line starting with ``id:``
    rather than relying on a fixed negative index. Each record is written as
    six field lines plus a blank line, so ``lines[-6]`` was the ``title:``
    line, and an empty or truncated file raised ``IndexError``.
    """
    with open(path, 'r') as f:
        lines = f.readlines()
    for line in reversed(lines):
        if line.startswith('id:'):
            digits = re.findall(r'\d+', line)
            if digits:
                return int(digits[0])
    return None


# Resume after the last saved record when an output file already exists;
# otherwise start right after id 1200.
last_id = None
if os.path.exists('2017210213.txt'):
    last_id = _last_saved_id('2017210213.txt')
if last_id is None:
    last_id = 1200
else:
    print('begin from', last_id)

# Crawl every remaining page id up to and including 1413.
for i in range(int(last_id) + 1, 1414):
    page = get_page(i)
    find(i, page)

后记

这个作业因为在写入txt文件时,多了一个换行符被扣了10分,black同学告诉我群里说过,我太阳痿了。