爬取某网站写的python代码-创新互联
代码如下:
# 创新互联公司坚持“要么做到,要么别承诺”的工作理念,服务领域包括:成都网站设计、网站建设、企业官网、英文网站、手机端网站、网站推广等服务,满足客户于互联网时代的盐津网站设计、移动媒体设计的需求,帮助企业找到有效的互联网解决方案。努力成为您成熟可靠的网络建设合作伙伴!
"""Crawl product information from http://www.kgtmall.com.cn/ into a CSV file.

Usage:
    1. Install Python 3 together with the ``requests`` and ``pyquery``
       packages (both are imported below).
    2. Run this script directly.
    3. When the run finishes, ``result.csv`` in the directory of this script
       holds the scraped product information.

Note: do not open ``result.csv`` while the script is running, because the
program may then be unable to write to it; open it only after the whole run
has finished.
"""
import csv
import datetime
import os
import re

import requests
from pyquery import PyQuery

# Extracts the category id from a third-level category link.  Raw string so
# that ``\?`` and ``\.`` are regex escapes rather than invalid string escapes.
_CATEGORY_URL_RE = re.compile(
    r'http://www\.kgtmall\.com\.cn/mall/search\.php\?catid=(.*)')

# Listing URL for one category id and one page number.  The original variable
# name ("by_xiaoliang") suggests order=10 sorts by sales volume -- TODO confirm.
_LISTING_URL_TEMPLATE = (
    'http://www.kgtmall.com.cn/mall/search.php'
    '?kw=&list=0&catid={}&order=10&minprice=&maxprice=&page={}')


def _result_csv_path():
    """Return the absolute path of ``result.csv`` next to this script."""
    # abspath() first: a bare relative __file__ has an empty dirname, which
    # would otherwise turn the target into '/result.csv'.
    return os.path.join(os.path.dirname(os.path.abspath(__file__)),
                        'result.csv')


def _first_group(pattern, text, default):
    """Return group 1 of the first match of *pattern* in *text*, else *default*."""
    match = re.search(pattern, text)
    return match.group(1) if match else default


def get_html_text(url, timeout=30):
    """Download *url* and return the response body as text.

    :param url: address to fetch
    :param timeout: seconds to wait for the server before giving up, so a
        stalled connection cannot hang the crawl forever
    :return: decoded response text
    """
    r = requests.get(url, timeout=timeout)
    return r.text


def get_one_level_class(home_url):
    """Yield ``(url, title)`` for every first-level category on the home page.

    Example output:
        母婴用品 http://www.kgtmall.com.cn/mall/list.php?catid=4
        生活家居 http://www.kgtmall.com.cn/mall/list.php?catid=5
    """
    page = PyQuery(get_html_text(home_url))
    for node in page('.menu_title a'):
        link = PyQuery(node)('a')
        yield link.attr('href'), link.text()


def get_two_level_class(home_url):
    """Yield ``(one_level_title, two_level_title, two_level_url)`` tuples.

    Example output:
        母婴用品 营养辅食 http://www.kgtmall.com.cn/mall/search.php?catid=539
        母婴用品 妈妈专区 http://www.kgtmall.com.cn/mall/search.php?catid=544
        母婴用品 婴儿保健 http://www.kgtmall.com.cn/mall/search.php?catid=887
    """
    for one_level_url, one_level_title in get_one_level_class(home_url):
        # The URL is handed straight to PyQuery, exactly as the original did.
        page = PyQuery(one_level_url)
        for node in page('.selector_category li'):
            link = PyQuery(node)('a')
            yield one_level_title, link.text(), link.attr('href')


def get_pages(url):
    """Return the number of result pages of the listing at *url*.

    The count is parsed from the ``.pagination cite`` text; when it cannot be
    parsed the error is printed and 1 is returned.

    :param url: listing page address
    :return: page count as an int
    """
    page = PyQuery(url)
    pages = page('.pagination cite').text()
    print('原pages:', pages)
    try:
        pages = int(re.findall('共.*?条/(.*)页', pages)[0])
    except (IndexError, ValueError) as e:
        # IndexError: pattern not found; ValueError: captured text not an int.
        print(e)
        pages = 1
    print('页码:', pages)
    return pages


def get_three_level_class(home_url):
    """Yield one listing-page URL per page of every third-level category.

    Each item is ``(one_level_title, two_level_title, three_level_title,
    listing_url)``.  Category links without a usable href or catid are
    reported and skipped instead of aborting the whole crawl.

    Example third-level categories:
        母婴用品 营养辅食 DHA http://www.kgtmall.com.cn/mall/search.php?catid=548
        母婴用品 营养辅食 益生菌/初乳 http://www.kgtmall.com.cn/mall/search.php?catid=549
        母婴用品 营养辅食 清火/开胃/驱虫 http://www.kgtmall.com.cn/mall/search.php?catid=550
    """
    for one_level_title, two_level_title, two_level_url in get_two_level_class(home_url):
        page = PyQuery(two_level_url)
        for node in page('.selector_category li'):
            link = PyQuery(node)('a')
            three_level_title = link.text()
            three_level_url = link.attr('href')
            match = _CATEGORY_URL_RE.search(three_level_url or '')
            if match is None:
                print('skipping category without catid:', three_level_title,
                      three_level_url)
                continue
            catid = match.group(1)
            pages = get_pages(three_level_url)
            for index in range(1, pages + 1):
                listing_url = _LISTING_URL_TEMPLATE.format(catid, index)
                yield one_level_title, two_level_title, three_level_title, listing_url


def shop_title_and_url(home_url):
    """Yield the title and detail-page URL of every product on every listing.

    Each item is ``(one_level_title, two_level_title, three_level_title,
    shop_title, shop_url)``.

    Example output:
        母婴用品 营养辅食 DHA 澳洲直邮 澳大利亚RIFOLD 儿童DHA90粒(一月以上适用) http://www.kgtmall.com.cn/mall/show.php?itemid=28089
        母婴用品 营养辅食 益生菌/初乳 澳大利亚 Maxigenes美可卓 全脂高钙奶粉(蓝胖子)1kg 两罐装 http://www.kgtmall.com.cn/mall/show.php?itemid=23486
    """
    for one_level_title, two_level_title, three_level_title, listing_url in get_three_level_class(home_url):
        page = PyQuery(listing_url)
        for node in page('.list_img a'):
            item = PyQuery(node)
            shop_url = item('a').attr('href')
            shop_title = item('a img').attr('alt')
            yield one_level_title, two_level_title, three_level_title, shop_title, shop_url


def get_shop_info(home_url, count):
    """Scrape every product detail page and append one CSV row per product.

    :param home_url: site home page where the crawl starts
    :param count: number printed next to the first product; it is increased
        by one for each following product
    :return: None
    """
    # Open the file once for the whole crawl instead of once per row, and
    # flush after each row so already-scraped data survives an abort.
    # newline='' avoids blank lines between rows; utf-8 avoids charset errors.
    with open(_result_csv_path(), 'a', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        for one_level_title, two_level_title, three_level_title, shop_title, shop_url in shop_title_and_url(home_url):
            # Trace line printed before fetching, to locate a failing product.
            print('--排错:' + one_level_title, two_level_title, three_level_title, shop_title, shop_url)
            page = PyQuery(shop_url)
            price = page('.price').text()
            # Text of the bar-code element.
            bar_code = page('.bar_code dl dd p').text()
            detail_text = page('#content').text()
            # Specification and place of origin, with the original fallbacks.
            guige = _first_group('规格:(.*)', detail_text, '没有规格')
            chandi = _first_group('产地:(.*)', detail_text, '没有产地')
            print(count, one_level_title, two_level_title, three_level_title, shop_title, bar_code, chandi, guige,
                  price, shop_url)
            writer.writerow([one_level_title, two_level_title, three_level_title, shop_title, bar_code, chandi,
                             guige, price, shop_url])
            f.flush()
            count += 1


def main():
    """Write the CSV header, run the crawl and print the elapsed time."""
    # Remember the start time.
    start_time = datetime.datetime.now()

    home_url = 'http://www.kgtmall.com.cn/'
    headers = ['一级分类', '二级分类', '三级分类', '商品名称', '条码', '产地', '规格', '价格', '商品链接']
    # Mode 'w' starts a fresh file on every run; newline='' avoids blank
    # lines between rows; utf-8 avoids charset errors when writing.
    with open(_result_csv_path(), 'w', newline='', encoding='utf-8') as f:
        csv.writer(f).writerow(headers)

    count = 1
    get_shop_info(home_url, count)

    # Remember the end time and report the total duration.
    end_time = datetime.datetime.now()
    timediff = end_time - start_time
    # total_seconds() includes whole days; .seconds alone would drop them.
    print('总共用时{}秒\n'.format(int(timediff.total_seconds())))
    print('全部商品已经按需求完成!!!')


if __name__ == '__main__':
    main()
运行后,会在脚本所在目录下生成一个result.csv文件,内容如下:
当前标题:爬取某网站写的python代码-创新互联
网站链接:http://pcwzsj.com/article/csjsej.html