Scrapbook¶
Scrapbook is a simple scraping library.
from scrapbook import Element, Content
import requests


class Twitter(Content):
    username = Element(
        xpath='//*[@id="page-container"]/div[2]/div/div'
              '/div[1]/div/div/div/div[1]/h2/a/span/b/text()',
    )
    screen_name = Element(
        xpath='//*[@id="page-container"]/div[2]/div/div/'
              'div[1]/div/div/div/div[1]/h1/a/text()',
    )


response = requests.get('https://twitter.com/odoku')
data = Twitter().parse(response.text)
print(data)
Requirements¶
- Python 2.7 or Python 3.3+
Installation¶
pip install scrapbook
Page¶
Element¶
Extracts the element specified by xpath from the HTML.
from scrapbook import Element
import requests

response = requests.get('https://twitter.com/odoku')

screen_name = Element(
    xpath='//*[@id="page-container"]/div[2]/div/div'
          '/div[1]/div/div/div/div[1]/h2/a/span/b/text()',
)

name = screen_name.parse(response.text)
print(name)
Arguments¶
Element(
    xpath: Optional[str] = None,
    filter: Union[Callable, str, list[Union[Callable, str]]] = scrapbook.filters.through,
    parser: Union[Callable, str] = scrapbook.parsers.First(),
)
xpath¶
Specify the xpath of the element you want to retrieve.
el = Element(xpath='/html/body/p/text()')
texts = el.parse(html)
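For example, run against a small document (a minimal sketch; the markup is invented for illustration):

html = '<html><body><p>Hello!</p></body></html>'

el = Element(xpath='/html/body/p/text()')
text = el.parse(html)  # 'Hello!' (the default First parser returns the first match)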
filter¶
You can run arbitrary processing on the extracted value.

def clean(values):
    return [v.strip() for v in values]

el = Element(xpath='/html/body/p/text()', filter=clean)
More than one filter can be specified.
def to_int(values):
    return [int(v.strip()) for v in values]

Element(xpath='/html/body/p/text()', filter=[to_int, sum])
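Filters run left to right, each receiving the previous one's output. Applied by hand, outside the library:

values = [' 1 ', ' 2 ', ' 3 ']
assert 6 == sum(to_int(values))  # to_int -> [1, 2, 3], then sum -> 6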
parser¶
You can specify a function to parse the element specified by xpath.
def parse_link(selector):
    # selector is a parsel.SelectorList
    return {
        'url': selector.xpath('./@href').extract_first(),
        'text': selector.xpath('./text()').extract_first(),
    }

Element(xpath='/html/body/a', parser=parse_link)
Content¶
You can handle multiple Elements at once.
from scrapbook import Element, Content
import requests


class Twitter(Content):
    username = Element(
        xpath='//*[@id="page-container"]/div[2]/div/div'
              '/div[1]/div/div/div/div[1]/h2/a/span/b/text()',
    )
    screen_name = Element(
        xpath='//*[@id="page-container"]/div[2]/div/div/'
              'div[1]/div/div/div/div[1]/h1/a',
    )


response = requests.get('https://twitter.com/odoku')
data = Twitter().parse(response.text)
print(data)
Include filter/parser functions¶
The filter and parser referenced by name in an Element can be defined as methods on the Content class.
class Page(Content):
    username = Element(
        xpath='//*[@id="username"]',
        parser='parse_username',
        filter='filter_username',
    )

    def parse_username(self, selector):
        return selector.xpath('./text()').extract_first()

    def filter_username(self, value):
        return value.replace('username: ', '').strip()
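A hypothetical run (the markup is invented for illustration):

html = '<html><body><span id="username">username: odoku</span></body></html>'

data = Page().parse(html)  # expected: {'username': 'odoku'}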
Nest¶
Content can be nested.
class Profile(Content):
    username = Element(xpath='./path/to/username/text()')
    screen_name = Element(xpath='./path/to/screen_name/text()')


class Page(Content):
    profile = Profile(xpath='//*[@id="profile"]')
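Parsing then yields a nested dict. A minimal sketch, with markup and xpaths invented for illustration:

html = '''
<div id="profile">
    <span class="username">odoku</span>
    <span class="screen-name">Odoku Taro</span>
</div>
'''


class Profile(Content):
    username = Element(xpath='./span[@class="username"]/text()')
    screen_name = Element(xpath='./span[@class="screen-name"]/text()')


class Page(Content):
    profile = Profile(xpath='//*[@id="profile"]')


data = Page().parse(html)
# Expected shape: {'profile': {'username': 'odoku', 'screen_name': 'Odoku Taro'}}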
Inheritance¶
Content supports inheritance.
class Common(Content):
    title = Element(xpath='/path/to/title/text()')


class ProjectPage(Common):
    name = Element(xpath='/path/to/name/text()')


class TeamPage(Common):
    name = Element(xpath='/path/to/name/text()')
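Subclasses inherit the parent's Elements, so ProjectPage parses title in addition to its own name (illustrative shape; html stands for a matching document):

data = ProjectPage().parse(html)
# Expected shape: {'title': ..., 'name': ...}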
Arguments¶
Content(
    xpath: Optional[str] = None,
    filter: Union[Callable, str, list[Union[Callable, str]]] = scrapbook.filters.through,
    many: bool = False,
)
xpath¶
Specify the xpath of the element you want to parse. The matched element becomes the context against which each contained Element's xpath is resolved.
class Page(Content):
    username = Element(xpath='./span[1]/text()')


page = Page(xpath='//*[@id="profile"]')
data = page.parse(html)
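For instance, with markup like this (invented for illustration), ./span[1]/text() is resolved inside the matched #profile element:

html = '''
<div id="profile">
    <span>odoku</span>
</div>
'''

data = page.parse(html)  # expected: {'username': 'odoku'}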
filter¶
You can run arbitrary processing on the parsed value. As with Element, multiple filters can be specified.
class Page(Content):
    username = Element(xpath='./span[1]/text()')


def rename(value):
    alias = {'username': 'account'}
    return {alias.get(k, k): v for k, v in value.items()}


page = Page(xpath='//*[@id="profile"]', filter=rename)
data = page.parse(html)
many¶
If the xpath matches multiple elements, specifying many=True returns the results as a list.
class Comment(Content):
    text = Element(xpath='./text()')


class Article(Content):
    title = Element(xpath='//*[@id="title"]')
    content = Element(xpath='//*[@id="content"]')
    comments = Comment(xpath='//*[@id="content-list"]/li', many=True)


article = Article()
data = article.parse(html)
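The parsed result would then contain one dict per matched <li>, roughly:

# Illustrative result shape:
# {
#     'title': ...,
#     'content': ...,
#     'comments': [{'text': ...}, {'text': ...}],
# }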
Methods¶
parse¶
parse(
    html: Union[str, parsel.Selector, parsel.SelectorList],
    object: Optional[Any] = None,
)

Parses the given HTML.
class Page(Content):
    content = Element(xpath='/html/body/p/text()')


html = '<html><body><p>Hello!</p></body></html>'

page = Page()
data = page.parse(html)  # {'content': 'Hello!'}
If the object argument is given, the parsed values are mapped onto that object's attributes.
instance = PageModel()
page = Page()
instance = page.parse(html, object=instance)
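PageModel is not defined here; as a minimal sketch (an assumed stand-in, not part of the library), any object whose attribute names match the Content's fields should work:

class PageModel:
    content = None  # attribute name matches Page.content


instance = PageModel()
instance = Page().parse(html, object=instance)
print(instance.content)  # 'Hello!'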
Class Methods¶
inline¶
inline(
    xpath: Optional[str] = None,
    filter: Union[Callable, str, list[Union[Callable, str]]] = scrapbook.filters.through,
    **attrs: Dict[str, Any]
)

Returns an instance of a dynamically generated Content class.
class Page(Content):
    content = Content.inline(
        text=Element(xpath='/html/body/p/text()', filter='twice'),
    )

    def twice(self, value):
        return value * 2


html = '<html><body><p>Hello!</p></body></html>'

page = Page()
data = page.parse(html)  # {'content': {'text': 'Hello!Hello!'}}
Filters¶
By applying filters to an Element or Content, you can shape the extracted value into your preferred format.
el = Element(
    xpath='//html/body/ul/li',
    filter=[
        Map(
            clean_text,
            Normalize(),
            Fetch(r'(?P<key>.+): (?P<count>\d+)'),
        ),
        lambda values: {v['key']: v['count'] for v in values},
    ],
)
Map¶
Applies the given filters to each element of a list or dict.
filter = Map(clean_text, Equals('yes'))
result = filter({
    'AAA': ' no ',
    'BBB': ' yes ',
    'CCC': ' <strong> yes <strong> ',
})

assert {
    'AAA': False,
    'BBB': True,
    'CCC': True,
} == result
It is also possible to call functions defined in the Content class.
class Page(Content):
    links = Element(xpath='//a/@href', parser=All(), filter=Map('filter_link'))

    def filter_link(self, value):
        url = urlparse(value)
        return url.netloc


page = Page(xpath='')
result = page.parse('''
    <a href="http://google.com">Google</a>
    <a href="http://twitter.com">Twitter</a>
    <a href="http://facebook.com">Facebook</a>
''')

assert {
    'links': [
        'google.com',
        'twitter.com',
        'facebook.com',
    ]
} == result
Through¶
Returns the passed value unchanged. This is the default filter for Element / Content.
assert 10 == through(10)
TakeFirst¶
Returns the first element of a list, skipping any element that is None or ''.
assert 10 == take_first([None, '', 10])
CleanText¶
Performs the following cleaning on a string:
- Removes HTML tags
- Decodes HTML entities
- Collapses runs of two or more spaces into a single space
- Strips leading and trailing whitespace
clean_text = CleanText()
assert 'aaa & bbb' == clean_text('<p> aaa &amp; bbb </p>')
You can specify how to handle empty values.
clean_text = CleanText(empty_value='empty')
assert 'empty' == clean_text('')
Line breaks can also be replaced with a space.
clean_text = CleanText(remove_line_breaks=True)
assert 'a b' == clean_text('a\nb')
Equals¶
Returns True if the value matches the specified string.
equals = Equals('yes')
assert equals('yes')
Contains¶
Returns True if the value contains the specified substring.
contains = Contains('B')
assert contains('ABC')
Fetch¶
Extract values from strings using regular expressions.
fetch = Fetch(r'\d+')
assert '100' == fetch('Price: $100')
You can also get all matched values.
fetch = Fetch(r'\d+', all=True)
assert ['100', '20'] == fetch('Price: $100, Amount: 20')
By using named groups, the result can be returned as a dict.
fetch = Fetch(r'Price: \$(?P<price>\d+), Amount: (?P<amount>\d+)')
assert {'price': '100', 'amount': '20'} == fetch('Price: $100, Amount: 20')
Replace¶
Replaces parts of a string using regular expressions.
replace = Replace(r'A+', 'A')
assert 'ABC' == replace('AAAAABC')
Join¶
Returns a string formed by joining the elements of a list with the given separator.
join = Join(',')
assert 'A,B,C' == join(['A', 'B', 'C'])
Normalize¶
Returns the Unicode-normalized string (e.g. full-width characters become half-width).
normalize = Normalize()
assert '12AB&%' == normalize('１２ＡＢ＆％')
RenameKey¶
Renames the keys of a dict.
rename_key = RenameKey({'AAA': 'BBB'})
assert {'BBB': 10} == rename_key({'AAA': 10})
FilterDict¶
Returns a dict containing only the specified keys.
filter_dict = FilterDict(['AAA', 'BBB'])
assert {'AAA': 10, 'BBB': 20} == filter_dict({'AAA': 10, 'BBB': 20, 'CCC': 30})
With ignore=True, keys other than the specified ones are returned instead.
filter_dict = FilterDict(['AAA', 'BBB'], ignore=True)
assert {'CCC': 30} == filter_dict({'AAA': 10, 'BBB': 20, 'CCC': 30})
Partial¶
Binds some of a function's arguments in advance; the filtered value is passed as the argument named by arg_name.

def add(a, b, c):
    return a + b + c

result = Partial(add, kwargs={'a': 10, 'c': 30}, arg_name='b')(20)
assert 60 == result
The Partial filter handles empty values safely, which makes it a convenient wrapper for plain functions.
assert Partial(int)('') is None
assert Partial(int)('10') == 10
DateTime¶
Converts a datetime string to a datetime object.
parse_dt = DateTime()
assert datetime(2001, 2, 3, 4, 5, 6) == parse_dt('2001-02-03 04:05:06')
Timezones are handled as well.
parse_dt = DateTime()
result = parse_dt('2001-02-03T04:05:06+09:00')
assert datetime(2001, 2, 3, 4, 5, 6, 0, tzoffset(None, 3600 * 9)) == result
Unnecessary information can be truncated.
parse_dt = DateTime(truncate_timezone=True)
result = parse_dt('2001-02-03T04:05:06+09:00')
assert datetime(2001, 2, 3, 4, 5, 6) == result
parse_dt = DateTime(truncate_time=True)
result = parse_dt('2001-02-03T04:05:06+09:00')
assert date(2001, 2, 3) == result
You can also specify the format.
parse_dt = DateTime(format='%d %m %Y')
result = parse_dt('01 02 2003')
assert datetime(2003, 2, 1) == result
Bool¶
Converts a string to a bool.
parse_bool_string = Bool()
assert parse_bool_string('true')
You can specify which strings to treat as True.
parse_bool_string = Bool('OK', 'ok')
assert parse_bool_string('OK')
Parsers¶
First¶
Gets only the first element matching the specified xpath. This is the default parser of Element.
<html>
    <body>
        <p>
            AAA
            <br>
            BBB
            <br>
            CCC
        </p>
    </body>
</html>

el = Element(xpath='//html/body/p/text()', parser=First())
text = el.parse(html)
assert 'AAA' == text
All¶
Converts all elements matching the xpath into text and returns them as a list.
<html>
    <body>
        <p>
            AAA
            <br>
            BBB
            <br>
            CCC
        </p>
    </body>
</html>

el = Element(xpath='//html/body/p/text()', parser=All())
texts = el.parse(html)
assert ['AAA', 'BBB', 'CCC'] == texts
ParseTable¶
Parses a basic table and returns it as a list of rows.
<html>
    <body>
        <table>
            <tr>
                <th>Company</th>
                <th>Contact</th>
                <th>Country</th>
            </tr>
            <tr>
                <td>Alfreds Futterkiste</td>
                <td>Maria Anders</td>
                <td>Germany</td>
            </tr>
            <tr>
                <td>Centro comercial Moctezuma</td>
                <td>Francisco Chang</td>
                <td>Mexico</td>
            </tr>
        </table>
    </body>
</html>
el = Element(xpath='//html/body/table', parser=ParseTable())
data = el.parse(html)

assert [
    ['Alfreds Futterkiste', 'Maria Anders', 'Germany'],
    ['Centro comercial Moctezuma', 'Francisco Chang', 'Mexico'],
] == data
If the table has a header row, passing has_header=True returns each row as a dict keyed by the header values.
el = Element(xpath='//html/body/table', parser=ParseTable(has_header=True))
data = el.parse(html)

assert [
    {
        'Company': 'Alfreds Futterkiste',
        'Contact': 'Maria Anders',
        'Country': 'Germany',
    },
    {
        'Company': 'Centro comercial Moctezuma',
        'Contact': 'Francisco Chang',
        'Country': 'Mexico',
    },
] == data
ParseList¶
Parses elements such as <ul> and <ol> and returns their items as a list.
<html>
    <body>
        <ol>
            <li>Coffee</li>
            <li>Tea</li>
            <li>Milk</li>
        </ol>
    </body>
</html>
el = Element(xpath='//html/body/ol', parser=ParseList())
data = el.parse(html)
assert ['Coffee', 'Tea', 'Milk'] == data
ParseDefinitionList¶
Parses a <dl> and returns it as a dict.
<html>
    <body>
        <dl>
            <dt>Coffee</dt>
            <dd>black hot drink</dd>
            <dt>Milk</dt>
            <dd>white cold drink</dd>
            <dd>white hot drink</dd>
        </dl>
    </body>
</html>
el = Element(xpath='//html/body/dl', parser=ParseDefinitionList())
data = el.parse(html)

assert {
    'Coffee': 'black hot drink',
    'Milk': [
        'white cold drink',
        'white hot drink',
    ],
} == data