HOME/Articles/

pil example crawl mp (snippet)

Article Outline

Python pil example 'crawl mp'

Functions in program:

  • def crawl_person(link, name):
  • def crawl_party(link):
  • def main():

Modules used in program:

  • import requests as rq
  • from lxml.html import fromstring

python crawl mp

Python pil example: crawl mp

# Get the name and image of each MP from sobranie.mk and save them in `PATH`

import requests as rq
from lxml.html import fromstring

# Directory that the downloaded images are written into.
PATH = 'target'
# Site root; hrefs and image srcs found on the pages are appended to it.
base = 'http://www.sobranie.mk/'
# Entry page whose links main() follows.
start = base + '?ItemID=C5223B907BD9D247A9245C0C30C2E6AE'

def main():
    """Fetch the start page and crawl every table-of-contents link on it.

    Each matching anchor's href is appended to ``base`` and handed to
    ``crawl_party``.
    """
    response = rq.get(start)
    tree = fromstring(response.content)
    selector = "table.DetalnoTabelaTopHome a.WB_SOBRANIE_TocItem[href]"
    for anchor in tree.cssselect(selector):
        party_url = base + anchor.attrib['href']
        crawl_party(party_url)

def crawl_party(link):
    """Fetch the page at *link* and crawl every article-title link on it.

    For each matching anchor, ``crawl_person`` is called with the
    anchor's href appended to ``base`` and the anchor's text content.
    """
    response = rq.get(link)
    tree = fromstring(response.content)
    selector = "table.DetalnoTabelaTopHome a.WB_SOBRANIE_ArticleTitle[href]"
    for anchor in tree.cssselect(selector):
        person_url = base + anchor.attrib['href']
        person_name = anchor.text_content()
        crawl_person(person_url, person_name)

def crawl_person(link, name):
    """Download the single image on a person's page to ``PATH/<name>.jpg``.

    Args:
        link: URL of the person's page.
        name: Text used for console output and as the file name;
            surrounding whitespace is stripped and a blank name is
            skipped without any request being made.

    Returns:
        None. Prints the name on success, or a "Problem with ..." line
        when the page does not contain exactly one matching image.
    """
    name = name.strip()
    if not name:
        return

    page = rq.get(link)
    doc = fromstring(page.content)
    imgs = doc.cssselect("table.DetalnoTabelaTop table p img[src]")

    # Console output must use the native ``str`` type: on Python 3,
    # concatenating 'Problem with ' with the bytes returned by .encode()
    # raises TypeError, and printing bytes shows b'...' instead of the
    # name. Only a non-``str`` text value (Python 2 ``unicode``) is encoded.
    label = name if isinstance(name, str) else name.encode('utf-8')

    if len(imgs) != 1:
        print('Problem with ' + label)
        return

    image = rq.get(base + imgs[0].attrib['src'])
    print(label)
    # NOTE: the PATH directory is not created here; open() fails if it
    # does not already exist.
    with open(PATH + '/' + name + '.jpg', 'wb') as f:
        f.write(image.content)

# Run the crawl only when this file is executed as a script.
if __name__ == '__main__':
    main()