uds-se / debuggingbook
Project page for "The Debugging Book"
Home Page: https://www.debuggingbook.org/
License: Other
Really enjoying and learning a lot from the book so far!
In docs/beta/html/Tours.html and docs/html/Tours.html, on line 12339, the tracing link should point to Tracer.html instead of Tracing.html.
For my work on learning symbolic inputs, I wrote a variant of delta debugging that reduces parse trees based on an existing grammar. It works quite well and is reasonably fast so far. I do not know whether it is relevant for the Debugging Book, and the code is quite lengthy and not didactically optimized, but I thought I'd share it here anyway.
After the below code implementing the reduction, I'll also post an example processing CSV.
import itertools
import logging
import re
from fuzzingbook.GrammarFuzzer import tree_to_string
from fuzzingbook.Grammars import RE_NONTERMINAL, is_nonterminal
from fuzzingbook.Parser import EarleyParser
class Reducer:
def __init__(self, grammar,
concrete_input,
test,
expected_exc_type=Exception,
max_tries_for_generalization=50,
max_tries_for_validation=200,
max_retries_after_failed_validation=3):
self.grammar = grammar
self.test = test
self.max_tries_for_generalization = max_tries_for_generalization
self.max_tries_for_validation = max_tries_for_validation
self.max_retries_after_failed_validation = max_retries_after_failed_validation
self.expected_exc_type = expected_exc_type
self.logger = logging.getLogger("Reducer")
self.concrete_input = concrete_input
if max_tries_for_validation < max_tries_for_generalization:
self.max_tries_for_validation = 2 * max_tries_for_generalization
def reduce(self, concrete_input=None):
"""Implements the idea of Delta Debugging for input trees."""
self.logger.info("PHASE START: Reduction à la Delta Debugging.")
if concrete_input is None:
concrete_input = self.concrete_input
parse_tree = list(EarleyParser(self.grammar).parse(concrete_input))[0]
did_generalize = True
while did_generalize:
paths = [[0]]
did_generalize = False
while paths:
path = paths.pop()
node, children = Reducer.get_subtree(parse_tree, path)
tree_for_log = tree_to_string((node, children))
compatible_subtree_paths = []
for i, child in enumerate(children):
compatible_paths = self.find_paths(child, lambda p, t: t[0] == node)
if compatible_paths:
compatible_subtree_paths.extend(list(map(lambda p: path + [i] + p, compatible_paths)))
did_replace = False
                # Step 1: Try to replace this subtree by one of its subtrees
for compatible_path in compatible_subtree_paths:
subtree = Reducer.get_subtree(parse_tree, compatible_path)
maybe_new_input = Reducer.replace_subtree(parse_tree, subtree, path)
if self.test_input(tree_to_string(maybe_new_input)):
self.logger.info(f"Generalizing path {path}, "
f"replacing {tree_to_string((node, children))} "
f"with {tree_to_string(subtree)}")
parse_tree = maybe_new_input
self.logger.info(f"Current input: {tree_to_string(parse_tree)}")
did_replace, did_generalize = True, True
break
                # Step 2: Try to choose another expansion from the grammar that
                # replaces the current children with fewer children.
if not did_replace:
expansions = [list(filter(lambda l: l, re.split(RE_NONTERMINAL, e)))
for e in self.grammar.get(node, [])]
                    current_expansion = [c[0] for c in children]
for elements in [e for e in expansions if e != current_expansion]:
# Find matching subtrees for each nonterminal
nonterminals = [e for e in elements if is_nonterminal(e)]
compatible_subtree_paths = [None] * len(nonterminals)
for k, nonterminal in enumerate(nonterminals):
compatible_subtree_paths[k] = []
for i, child in enumerate(children):
compatible_paths = self.find_paths(child, lambda p, t: t[0] == nonterminal)
if compatible_paths:
compatible_subtree_paths[k].extend(
list(map(lambda p: path + [i] + p, compatible_paths)))
                        # Try all instantiations; itertools.product yields every combination:
                        # [[1, 2], [3, 4, 5]] -> (1, 3), (1, 4), (1, 5), (2, 3), ...
for combination in itertools.product(*compatible_subtree_paths):
i = 0
new_children = []
for element in elements:
                                if not is_nonterminal(element):
                                    # Terminal symbol: wrap it as a leaf node
                                    new_children.append((element, []))
                                else:
                                    new_children.append(Reducer.get_subtree(parse_tree, combination[i]))
                                    i += 1
subtree = (node, new_children)
if Reducer.count_nodes((node, children)) <= Reducer.count_nodes((node, new_children)):
continue
maybe_new_input = Reducer.replace_subtree(parse_tree, subtree, path)
if self.test_input(tree_to_string(maybe_new_input)):
self.logger.info(f"Generalizing path {path}, "
f"replacing {tree_to_string((node, children))} "
f"with {tree_to_string(subtree)}")
parse_tree = maybe_new_input
self.logger.info(f"Current input: {str(parse_tree)}")
did_replace, did_generalize = True, True
break
if not did_replace:
self.logger.info(f"Cannot generalize path {path}, expression {tree_for_log}")
if children is not None:
for i, child in enumerate(children):
child_symbol, _ = child
if child_symbol in self.grammar:
paths.append(path + [i])
self.logger.info("PHASE END: Reduction à la Delta Debugging.")
self.concrete_input = tree_to_string(parse_tree)
return self.concrete_input
def test_input(self, concrete_input):
self.logger.debug(f"Testing {repr(concrete_input)}...")
try:
self.test(concrete_input)
except Exception as exc:
if issubclass(type(exc), self.expected_exc_type):
self.logger.debug(f"FAIL ({type(exc).__name__})")
return True
else:
self.logger.info(f"UNEXPECTED FAIL ({type(exc).__name__})")
return False
else:
self.logger.debug(f"PASS")
return False
def find_paths(self, tree, predicate, path=None):
"""
Return a list of all paths for which `predicate` holds.
`predicate` is a function `predicate`(`path`, `tree`), where
`path` denotes a subtree in `tree`. If `predicate()` returns
True, `path` is included in the returned list.
Taken from the Debugging Book.
"""
if path is None:
path = []
symbol, children = Reducer.get_subtree(tree, path)
if predicate(path, (symbol, children)):
return [path]
paths = []
if children is not None:
for i, child in enumerate(children):
child_symbol, _ = child
if child_symbol in self.grammar:
paths += self.find_paths(tree, predicate, path + [i])
return paths
@staticmethod
def get_subtree(tree, path):
"""Access a subtree based on `path` (a list of children numbers)"""
node, children = tree
if not path:
return tree
return Reducer.get_subtree(children[path[0]], path[1:])
@staticmethod
def replace_subtree(tree, replacement_tree, path):
"""Returns a symbolic input with a new tree where replacement_tree has been inserted at `path`"""
def recurse(tree, path):
node, children = tree
if not path:
return replacement_tree
head = path[0]
new_children = (children[:head] +
[recurse(children[head], path[1:])] +
children[head + 1:])
return node, new_children
return recurse(tree, path)
@staticmethod
def dfs(tree, action=print):
node, children = tree
action(tree)
for child in children:
Reducer.dfs(child, action)
@staticmethod
def count_nodes(parse_tree):
num = 0
def count(t):
nonlocal num
num += 1
Reducer.dfs(parse_tree, count)
return num
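To follow the reduction steps when trying this out, standard logging configuration suffices:
import logging

logging.basicConfig(level=logging.INFO)  # show the Reducer's INFO-level progress messages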
The function read_csv_file below reads a CSV file into a custom data structure. If one line has more fields than the header, an IndexError is raised.
import string

from fuzzingbook.Grammars import convert_ebnf_grammar, srange

CSV_EBNF_GRAMMAR = {
"<start>": ["<csv-file>"],
"<csv-file>": ["<csv-record>*"],
"<csv-record>": ["<csv-string-list>\n"],
"<csv-string-list>": ["<raw-string>", "<raw-string>;<csv-string-list>"],
"<raw-string>": ["<spaces>", "<spaces>?<raw-field><spaces>?"],
"<raw-field>": ["<simple-field>", "<quoted-field>"],
"<simple-field>": ["<simple-character>*"],
"<simple-character>": [c for c in srange(string.printable) if c not in ["\n", ";", '"', " ", "\t", "\r"]],
"<quoted-field>": ['"<escaped-field>"'],
"<escaped-field>": ["<escaped-character>*"],
"<escaped-character>": [c for c in srange(string.printable) if c not in ['"']],
"<spaces>": [" ", " <spaces>"],
}
CSV_GRAMMAR = convert_ebnf_grammar(CSV_EBNF_GRAMMAR)
CSV_PARSER = EarleyParser(CSV_GRAMMAR)
class CSVFile:
def __init__(self, header: list, lines: list[list]):
self.header = header
self.lines = []
for line in lines:
the_dict = {}
for i, entry in enumerate(line):
the_dict[header[i]] = entry
self.lines.append(the_dict)
def __getitem__(self, item):
if type(item) is str:
return [line[item] for line in self.lines]
else:
return self.lines[item]
def __iter__(self):
return self.lines.__iter__()
def __str__(self):
result = " ; ".join(self.header)
if self.lines:
result += "\n"
result += "\n".join([" ; ".join(line.values()) for line in self.lines])
return result
def read_csv_file(inp):
tree = list(CSV_PARSER.parse(inp))[0]
fields = []
records = []
def collect_records(t):
nonlocal fields, records
node, children = t
if node == "<csv-record>":
record = []
for child in children:
fields = []
Reducer.dfs(child, collect_fields)
if fields:
record.append(fields)
records.extend(record)
def collect_fields(t):
nonlocal fields
node, children = t
if node == "<csv-string-list>":
fields.append(tree_to_string(children[0]))
Reducer.dfs(tree, collect_records)
if len(records) < 1:
raise SyntaxError("CSV file must contain at least a header line!")
return CSVFile(records[0], records[1:])
It can be used as shown below:
buggy_input = """"Field 1";"Field 2";"Field 3"
17;23;42
5648;13459;"some
multiline
text"
1;2;"now:";4
100;200;300
"""
reducer = Reducer(
CSV_GRAMMAR, buggy_input, read_csv_file, expected_exc_type=IndexError,
max_tries_for_generalization=100)
result = reducer.reduce(buggy_input)
self.assertEqual("""""
"";
""", result)
Currently there is no package management in the book. In order to make it work locally, one has to launch the notebook (or make it) and manually resolve missing package dependencies. Certainly, one can reuse the requirements.txt file from the binder folder (though this is not explicit and should be mentioned in the Readme), but it does not have versions pinned, which may not only create problems in the future but also introduce errors if some old, incompatible packages are already installed.
To make setting up the environment easier, one may use a virtualenv tool; I would highly recommend either pyenv-virtualenv or pipenv.
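For instance, a pipenv-based setup could look roughly like this (a sketch; it assumes the requirements file is binder/requirements.txt, as in the repository):
$ pip install pipenv
$ pipenv install -r binder/requirements.txt   # resolved versions get pinned in Pipfile.lock
$ pipenv shell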
The same issue applies to the code archives distributed via the https://www.debuggingbook.org/ website.
Using git submodules would be a more consistent way to share the code between projects and to keep things up to date.
The simplest way to refactor the current code is to leave the soft links as they are, but point them at files in the submodule instead of the external ../fuzzingbook; see the sketch below. Other suggestions are welcome.
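A rough sketch of the submodule setup (the submodule path fuzzingbook is an assumption):
$ git submodule add https://github.com/uds-se/fuzzingbook.git fuzzingbook
$ git submodule update --init
The existing soft links could then be retargeted from ../fuzzingbook to the fuzzingbook submodule directory.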
The minimization implementation does not minimize as I would expect if all inputs cause the failure; presumably the argument should be reduced to the empty string, since the function fails on every input:
from debuggingbook.DeltaDebugger import DeltaDebugger

def asdf(s):
    assert False

with DeltaDebugger() as dd:
    asdf("test")
print(dd)
At the end of the excursion in the Slicer chapter, after the section "Implementing Dynamic Instrumentation",
with Slicer(log=True) as slicer:
fun_2(10)
works, but
def run_slicer():
with Slicer() as slicer:
fun_2(10)
run_slicer()
yields a ValueError: Cannot find with block.
My initial guess is that starting_lineno has to be subtracted from frame.f_lineno in our_with_block, although I'm not entirely sure that this works in all cases.
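If it helps, here is a rough sketch of the adjustment I have in mind (hypothetical; the actual our_with_block in the chapter is structured differently):
import inspect

def relative_lineno(frame):
    # Hypothetical helper only -- not the book's code.
    # inspect.getsourcelines() returns the source and its starting line number.
    _, starting_lineno = inspect.getsourcelines(frame.f_code)
    # frame.f_lineno is absolute within the file; the AST parsed from the
    # extracted source starts at line 1, hence the offset:
    return frame.f_lineno - starting_lineno + 1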
Thanks!
To reproduce:
$ pip install debuggingbook
$ python -c 'import debuggingbook'
fails with ModuleNotFoundError.
Hey,
when I insert generators into the OchiaiDebugger, there are some issues when doing the following:
debugger = OchiaiDebugger()
with debugger:
    ...
print(debugger)
You can see the issues when printing the debugger. I don't know whether it has done its job correctly, but the printed output is wrong.
Best regards
VidmS
For the EventTracer class, wouldn't it make sense to pass the condition as a Python function? That is, rather than passing
with EventTracer(events=["c == '/'"]):
remove_html_markup('<b>foo</b>bar')
we would write
with EventTracer(events=[lambda: c == '/']):
remove_html_markup('<b>foo</b>bar')
(forwarded from Björn Mathis)
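One complication: a plain lambda: c == '/' cannot see the traced function's local variable c. A possible way around this is to evaluate callables against the current frame's locals; the following is a minimal sketch of that idea (PredicateTracer is a hypothetical name, not the book's EventTracer API):
import sys

class PredicateTracer:
    """Hypothetical sketch: trigger on events given as callables over frame locals."""

    def __init__(self, events):
        self.events = events  # e.g. [lambda env: env.get('c') == '/']

    def traceit(self, frame, event, arg):
        env = frame.f_locals
        for predicate in self.events:
            try:
                if predicate(env):
                    print(f"{frame.f_code.co_name}:{frame.f_lineno} {env}")
            except Exception:
                pass  # predicate may refer to names not (yet) defined
        return self.traceit

    def __enter__(self):
        sys.settrace(self.traceit)
        return self

    def __exit__(self, *args):
        sys.settrace(None)
The example would then read:
with PredicateTracer(events=[lambda env: env.get('c') == '/']):
    remove_html_markup('<b>foo</b>bar')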
I get the following warning after running git status:
$ git status
warning: unable to access '.gitignore': Too many levels of symbolic links
On branch master
Your branch is up to date with 'origin/master'.
nothing to commit, working tree clean
Indeed, it seems a bit convoluted:
$ ls -l .gitignore
lrwxr-xr-x 1 foo bar 16 Mar 22 16:12 .gitignore -> shared/gitignore
$ ls -l shared
lrwxr-xr-x 1 foo bar 16 Mar 22 16:12 shared -> notebooks/shared
Any chance to simplify things in order to get rid of the warning?
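One possible simplification (an untested sketch) is to drop one level of indirection and link .gitignore straight to its target:
$ rm .gitignore
$ ln -s notebooks/shared/gitignore .gitignore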
Running the "ChangeCounter" notebook on binder, the "pydriller" module is not available:
import pydriller
pydriller: not found
In case one wants to generate a book, HTML pages, or presentations from the provided notebooks, one can use the ipypublish tool (as mentioned in the Makefile). Currently the soft link points to the fuzzingbook/ipypublish file from the Fuzzingbook repo, which is missing (it is in .gitignore). Besides, this tool uses some custom rules stored in the ipypublish_plugins folder. However, the format of these plugins is not supported by the latest version of ipypublish (v0.10.12); the last supported version, v0.6.8, is quite old.
I would suggest updating these templates, or at least pinning the version of ipypublish in the Readme.
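Pinning could be as simple as installing the last version known to work (taken from the observation above):
$ pip install ipypublish==0.6.8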
Say, following https://neumorphism.io/#e0e0e0, DeltaDebugger#ddmin does not always find a 1-minimal result (as described here): in this particular case < > is returned, but <> also fails (with the same error, stack trace, etc.) and is never checked.
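To double-check such cases, a small helper can verify 1-minimality of a reduced input (a sketch with hypothetical names; test is assumed to raise on failure, as in the chapter):
def is_one_minimal(s, test, expected_exc=Exception):
    """True iff `test` fails on `s` but passes whenever any single character is removed."""
    def fails(inp):
        try:
            test(inp)
        except expected_exc:
            return True
        return False
    assert fails(s), "the reduced input should still fail"
    return not any(fails(s[:i] + s[i + 1:]) for i in range(len(s)))
Applied to the case above, is_one_minimal('< >', test) should return False, since dropping the space still fails.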
I wrote a Jupyter notebook reproducing the error and demonstrating the non-minimality. A smaller (maybe 1-minimal) example is generated further down using ddmin; it takes about 33 s on my machine and reduces the input size to about 700 characters.
Running the 6th executable cell of the Repairer chapter on mybinder.org:
# ignore
_, first_lineno = inspect.getsourcelines(middle)
middle_source = inspect.getsource(middle)
print_content(middle_source, '.py', start_line_number=first_lineno)
results in:
---------------------------------------------------------------------------
OSError Traceback (most recent call last)
<ipython-input-9-92a85bc6daab> in <module>
1 # ignore
----> 2 _, first_lineno = inspect.getsourcelines(middle)
3 middle_source = inspect.getsource(middle)
4 print_content(middle_source, '.py', start_line_number=first_lineno)
/srv/conda/envs/notebook/lib/python3.6/inspect.py in getsourcelines(object)
953 raised if the source code cannot be retrieved."""
954 object = unwrap(object)
--> 955 lines, lnum = findsource(object)
956
957 if istraceback(object):
/srv/conda/envs/notebook/lib/python3.6/inspect.py in findsource(object)
784 lines = linecache.getlines(file)
785 if not lines:
--> 786 raise OSError('could not get source code')
787
788 if ismodule(object):
OSError: could not get source code
... instead of relying on existing Jupyter servers, as we do with mybinder.org. One candidate would be JupyterLite.