我正试图为 Indian patent search website(印度专利检索网站)编写一个网络爬虫(web scraper)来获取有关专利的数据。这是我目前已有的代码。

#import the necessary modules
import urllib2
#import the beautifulsoup functions to parse the data
from bs4 import BeautifulSoup

# Fetch the patent search landing page and print its parsed HTML.

# The website to scrape. The original assignment was missing here, which left
# `patentsite` undefined (NameError on the next statement).
# NOTE(review): this base URL is derived from the search endpoint that appears
# later in this file -- confirm it is the page you intend to fetch.
patentsite = "http://ipindiaservices.gov.in/publicsearch/"

# Query the website and keep the response object in 'page'.
page = urllib2.urlopen(patentsite)
try:
    # Parse the HTML into a BeautifulSoup tree. The parser is named explicitly
    # so the result does not depend on which optional parsers are installed.
    soup = BeautifulSoup(page, "html.parser")
finally:
    # Release the underlying connection even if parsing fails.
    page.close()

# Single-argument call form prints the same on Python 2 and Python 3.
print(soup)

##                                                               ##
##                                                               ##
##           SIDDHAST.COM                                        ##
##                                                               ##
##                                                               ##
<meta content="IE=edge" http-equiv="X-UA-Compatible"/>
<meta charset="utf-8"/>
<title>:: InPASS - Indian Patent Advanced Search System ::</title>
<link href="resources/ipats-all.css" rel="stylesheet"/>
<script src="app.js" type="text/javascript"></script>
<link href="resources/app.css" rel="stylesheet"/>




import requests
from time import time

# Form fields posted to the search web service. The three "[]" entries are
# parallel lists: one field code, one value and one operator per criterion.
data = {
    "publication_type_published": "on",
    "publication_type_granted": "on",
    "fieldDate": "APD",
    "datefieldfrom": "19120101",
    "datefieldto": "20160906",
    "operatordate": " AND ",
    "field[]": ["PA"],  # field codes (claims, description, patent number, ...) go here
    "fieldvalue[]": ["chris*"],  # matching values for the field codes above
    "operator[]": [" AND "],  # matching boolean operator for each criterion
    "page": "1",  # selects which page of results is returned
    "start": "0",  # NOTE(review): effect not confirmed
    "limit": "25",  # NOTE(review): len(r.json()[u'record']) stayed 25 regardless of this value
}

# The "_dc" query parameter is filled with the current timestamp (decimal point
# stripped); presumably it only serves as a cache-busting value.
post = "http://ipindiaservices.gov.in/publicsearch/resources/webservices/search.php?_dc={}".format(
    str(time()).replace(".", ""))

with requests.Session() as s:
    # Sent with every request of this session; presumably the endpoint expects
    # the request to identify itself as an AJAX call.
    s.headers.update({"X-Requested-With": "XMLHttpRequest"})
    # Without a timeout the call can block indefinitely on an unresponsive server.
    r = s.post(post, data=data, timeout=30)
    # Fail loudly on HTTP 4xx/5xx instead of handing an error page to later
    # code that expects a JSON body.
    r.raise_for_status()

{u'success': True, u'record': [{u'Publication_Status': u'Published', u'appDate': u'2016/06/16', u'pubDate': u'2016/08/31', u'title': u'ACTUATOR FOR DEPLOYABLE IMPLANT', u'sourceID': u'inpat', u'abstract': u'\n    Systems and methods are provided for usin.............

{u'Publication_Status': u'Published', u'appDate': u'2015/01/27', u'pubDate': u'2015/06/26', u'title': u'CORRUGATED PALLET', u'sourceID': u'inpat', u'abstract': u'\n    A corrugated paperboard pallet is produced from two flat blanks which comprise a pallet top and a pallet bottom. The two blanks are each folded to produce only two parallel vertically extending double thickness ribs&nbsp;three horizontal panels&nbsp;two vertical side walls and two horizontal flaps. The ribs of the pallet top and pallet bottom lock each other from opening in the center of the pallet by intersecting perpendicularly with notches in the ribs. The horizontal flaps lock the ribs from opening at the edges of the pallet by intersecting perpendicularly with notches&nbsp;and the vertical sidewalls include vertical flaps that open inward defining fork passages whereby the vertical flaps lock said horizontal flaps from opening.\n  ', u'Assignee': u'OLVEY Douglas A., SKETO James L., GUMBERT Sean G., DANKO Joseph J., GABRYS Christopher W., ', u'field_of_invention': u'FI10', u'publication_no': u'26/2015', u'patent_no': u'', u'application_no': u'642/DELNP/2015', u'UCID': u'WVJ4NVVIYzFLcUQvVnJsZGczcVRmSS96Vkh3NWsrS1h3Qk43S2xHczJ2WT0%3D', u'Publication_Type': u'A'}

python - 在印度专利网站上抓取专利数据的网站-LMLPHP
python - 在印度专利网站上抓取专利数据的网站-LMLPHP

08-18 09:44