爬虫 - qunxingw的博客

作者在 2018-08-25 19:24:45 发布以下内容
# -*- coding: utf-8 -*-
import requests
from bs4 import BeautifulSoup
import re

def download_page(url, timeout=10):
    """Download *url* and return the raw response body.

    A desktop Firefox ``User-Agent`` header is sent instead of the
    library default.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests.get`` may block forever on a stalled
            connection.

    Returns:
        The undecoded response body as ``bytes``.

    Raises:
        requests.RequestException: On connection failure, timeout, or a
            4xx/5xx HTTP status (``requests.HTTPError``).
    """
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:61.0) Gecko/20100101 Firefox/61.0"}
    response = requests.get(url, headers=headers, timeout=timeout)
    # Fail loudly on an error status rather than parsing an error page
    # as if it were the song listing.
    response.raise_for_status()
    return response.content

def get_content(html_text):
    """Extract song names and absolute song links from a page's HTML.

    Every ``<a>`` element whose ``href`` starts with ``/song/`` is treated
    as a song link. Each match is printed as it is found, and the complete
    mapping is printed once at the end.

    Args:
        html_text: HTML markup (``str`` or ``bytes``) to parse.

    Returns:
        dict mapping the anchor's visible text to its link, where the link
        is ``http://music.taihe.com`` followed by the anchor's ``href``.
        When several anchors share the same text, the last one wins.
        Anchors without any visible text are skipped.
    """
    soup = BeautifulSoup(html_text, 'html.parser')
    song_anchors = soup.find_all('a', href=re.compile(r'^/song/'))
    sname = {}
    for anchor in song_anchors:
        # ``anchor.string`` is None whenever the anchor contains nested
        # tags, which made all such links share the key None. get_text()
        # collects the text of every descendant instead.
        a_name = anchor.get_text(strip=True)
        if not a_name:
            # Nothing to display for this link (e.g. an image-only anchor).
            continue

        ass_href = 'http://music.taihe.com' + anchor['href']
        sname[a_name] = ass_href
        print('歌名：{}  =>链接:{}\n'.format(a_name, ass_href))
    print(sname)
    return sname
    
def output_html(sname, filename='music.html'):
    """Write the song mapping to an HTML file as a three-column table.

    Each row contains the song name, the link as plain text, and a
    clickable anchor pointing at the link.

    Args:
        sname: Mapping of song name to song URL.
        filename: Path of the HTML file to create or overwrite. Defaults
            to ``music.html`` in the current working directory.

    Raises:
        OSError: If the file cannot be opened or written.
    """
    # Function-scope import so the block does not depend on the file's
    # top-level import list.
    from html import escape

    rows = []
    for name, link in sname.items():
        # Names and links come from scraped markup; escape them so stray
        # '<', '&' or quotes cannot break the generated table.
        safe_name = escape(str(name))
        safe_link = escape(str(link), quote=True)
        rows.append(
            "<tr>"
            "<td>{0}</td>"
            "<td>{1}</td>"
            "<td><a href=\"{1}\">去听一下</a></td>"
            "</tr>".format(safe_name, safe_link)
        )

    # The charset declaration matches the encoding used for the file, so
    # browsers render the non-ASCII text correctly.
    document = (
        "<html>"
        "<head><meta charset='utf-8'></head>"
        "<body>"
        "<table border='1'>"
        + "".join(rows) +
        "</table>"
        "</body>"
        "</html>"
    )

    # Explicit UTF-8 avoids UnicodeEncodeError on platforms whose default
    # encoding cannot represent the text; ``with`` closes the file even
    # if the write fails.
    with open(filename, 'w', encoding='utf-8') as fout:
        fout.write(document)

def main():
    """Scrape every configured page and write all songs to one HTML file.

    The songs found on each URL are merged into a single mapping and the
    output file is written once after the loop. Writing inside the loop
    would overwrite the file for every URL, keeping only the last page.
    """
    urls = ['http://music.taihe.com/']  # more page URLs can be appended here
    songs = {}
    for url in urls:
        html_text = download_page(url)
        # Later pages override earlier ones when a song name repeats.
        songs.update(get_content(html_text))
    output_html(songs)


# Run the scraper only when executed as a script, not when imported.
if __name__ == '__main__':
    main()