diff --git a/.divinerconfig b/.divinerconfig index 9a524fc..ef67505 100644 --- a/.divinerconfig +++ b/.divinerconfig @@ -1,34 +1,33 @@ { "name" : "libphutil", "src_link" : "https://secure.phabricator.com/diffusion/PHU/browse/master/%f$%l", "groups" : { "overview" : "Overview", "contrib" : "Contributing to libphutil", "working" : "Working with libphutil", "util" : "Core Utilities", "library" : "Phutil Library System", "utf8" : "UTF-8", "internationalization" : "Internationalization", "filesystem" : "Filesystem", "exec" : "Command Execution", "futures" : "Futures", "channel" : "Channels (I/O Wrappers)", "aws" : "Amazon Web Services", "error" : "Error Handling", "markup" : "Markup", "console" : "Console Utilities", "aast" : "Abstract Abstract Syntax Tree", "xhpast" : "XHPAST (PHP/XHP Parser)", "conduit" : "Conduit (Service API)", "event" : "Events", "daemon" : "Daemons", "parser" : "Other Parsers", "testcase" : "Test Cases" }, "engines" : [ ["DivinerArticleEngine", {}], ["DivinerXHPEngine", {}] ] } - diff --git a/scripts/build_xhpast.sh b/scripts/build_xhpast.sh index ffb0b4c..6678865 100755 --- a/scripts/build_xhpast.sh +++ b/scripts/build_xhpast.sh @@ -1,36 +1,36 @@ #!/bin/bash set -e ROOT=`dirname $0`"/../support/xhpast" USE_GMAKE=1 command -v gmake >/dev/null 2>&1 || USE_GMAKE=0 echo echo "Building XHPAST..." echo cd $ROOT if [ $USE_GMAKE -eq 1 ]; then - gmake clean all install + gmake clean all install else - make clean all install + make clean all install fi echo echo "Testing xhpast works" echo if ! ./xhpast --version then echo echo >&2 "xhpast is broken :(" echo >&2 "Maybe try \`STATIC=1 ./build_xhpast.sh\` if the error" echo >&2 "is linker-related?" echo exit 2 fi echo echo "Build successful!" 
echo diff --git a/scripts/format_log.php b/scripts/format_log.php index 7b9ef98..a527ef7 100755 --- a/scripts/format_log.php +++ b/scripts/format_log.php @@ -1,11 +1,10 @@ #!/usr/bin/env php $info) { $hash = $info['hash']; $type = $info['type']; $obj = $obj_root.hash_path($hash); $link = $hardened.'/'.$path; $dir = dirname($link); if (!is_dir($dir)) { $ok = mkdir($dir, 0777, $recursive = true); if (!$ok) { throw new Exception("Failed to make directory for '{$link}'!"); } } // We need to use actual symlinks in this case. if ($type == 'link') { $ok = symlink(readlink($soft.'/'.$path), $link); if (!$ok) { throw new Exception("Failed to create symlink '{$link}'!"); } continue; } if ($type == 'exec') { // Multiple hardlinks share a single executable bit, so we need to keep // executable versions separate from nonexecutable versions. $obj .= '.x'; } // Make copies of each object, obj.0, obj.1, etc., after there are too many // hardlinks. This can occur for the empty file, particularly. $n = 0; do { $stat = @lstat($obj.'.'.$n); if (!$stat) { break; } if ($stat[3] < 32000) { // TODO: On NTFS, this needs to be 1023. It is // not apparently trivial to determine if a disk // is NTFS or not, or what the link limit for a // disk is. On linux "df -T /path/to/dir" may be // useful, but on OS X this does something totally // different... break; } ++$n; } while (true); $obj = $obj.'.'.$n; if ($stat === false) { $ok = mkdir(dirname($obj), 0777, $recursive = true); if (!$ok) { throw new Exception("Failed to make directory for '{$obj}'."); } $ok = copy($soft.'/'.$path, $obj); if (!$ok) { throw new Exception("Failed to copy file '{$soft}/{$path}'!"); } if ($type == 'exec') { $ok = chmod($obj, 0755); if (!$ok) { throw new Exception("Failed to chmod file '{$obj}'!"); } } } $ok = link($obj, $link); if (!$ok) { throw new Exception("Failed to hardlink '{$obj}' to '{$link}'!"); } } // TODO: Replace link to soft directory with link to hardened directory. 
// execx('ln -sf %s %s', $dst, $hardened); echo $hardened."\n"; exit(0); /** Fan a hash out into a nested path by inserting a '/' after each of its first three two-character [a-z0-9] groups (the preg_replace limit is 3). @param string $hash Hash to convert. @return string The hash with up to three '/' separators inserted. */ function hash_path($hash) { return preg_replace('/([a-z0-9]{2})/', '\1/', $hash, 3); } /** Build a map of relative path => array('hash' => ..., 'type' => 'file'|'exec'|'link') for a directory. When $dir/.git exists, the map is parsed from `git ls-tree -r --full-tree --abbrev=40 HEAD`; entries of type 'commit' are treated as submodules and mapped recursively under their own path. If that path throws, the exception is logged with phlog() and the directory is scanned with FileFinder instead, hashing each file as sha1("blob {len}\0{data}"). @param string $dir Directory to map. @return array Map of relative path to hash/type info; empty when nothing is found. */ function map_directory($dir) { try { if (Filesystem::pathExists($dir.'/.git')) { list($list) = execx( '(cd %s && git ls-tree -r --full-tree --abbrev=40 HEAD)', $dir); $list = trim($list); $list = explode("\n", $list); $map = array(); foreach ($list as $line) { $matches = null; $regexp = '/^(\d{6}) (\w+) ([a-z0-9]{40})\t(.*)$/'; if (!preg_match($regexp, $line, $matches)) { throw new Exception("Unable to parse line '{$line}'!"); } $flag = $matches[1]; $type = $matches[2]; $hash = $matches[3]; $file = $matches[4]; if ($type == 'commit') { // Deal with Git submodules. $submap = map_directory($dir.'/'.$file); foreach ($submap as $subfile => $info) { $map[$file.'/'.$subfile] = $info; } } else { /* The mode column is octal text; any execute bit marks the entry 'exec', otherwise mode 120000 marks a symlink. */ $mask = (int)base_convert($flag, 8, 10); $type = 'file'; if ($mask & 0111) { echo "EXEC: {$file}\n"; $type = 'exec'; } else if (($mask & 0120000) == 0120000) { $type = 'link'; } $map[$file] = array( 'hash' => $hash, 'type' => $type, ); } } return $map; } } catch (Exception $ex) { phlog($ex); // Just drop down and go with the non-git approach. } /* Start the fallback from an empty map: this discards partial entries left behind by a failed git listing and guarantees an array is returned even when no files are found. */ $map = array(); $files = id(new FileFinder($dir)) ->withType('f') ->excludePath('*/.git/*') ->excludePath('*/.svn/*') ->find(); foreach ($files as $file) { if (!strncmp($file, './', 2)) { $file = substr($file, 2); } $data = Filesystem::readFile($dir.'/'.$file); $len = strlen($data); $hash = sha1("blob {$len}\0{$data}"); $type = 'file'; if (is_link($dir.'/'.$file)) { $type = 'link'; } else if (is_executable($dir.'/'.$file)) { $type = 'exec'; } $map[$file] = array( 'hash' => $hash, 'type' => $type, ); } return $map; } - diff --git a/src/docs/aws.diviner b/src/docs/aws.diviner index 3620324..9504336 100644 --- a/src/docs/aws.diviner +++ b/src/docs/aws.diviner @@ -1,8 +1,8 @@ @title Using Amazon Web Services APIs @group aws Don't use them. = Mega Alpha = -NOTE: These APIs are really sketchy right now. 
\ No newline at end of file +NOTE: These APIs are really sketchy right now. diff --git a/src/docs/drafts/dom_datastore.txt b/src/docs/drafts/dom_datastore.txt index b47d73c..1693de9 100644 --- a/src/docs/drafts/dom_datastore.txt +++ b/src/docs/drafts/dom_datastore.txt @@ -1,73 +1,71 @@ - This document describes what you should put in the DOM (display) and what you shouldn't (data). It is adapted from an e-mail to javascript@lists. =The DOM is Not a Datastore. Don't Store Application State in the DOM.= I don't know why this is so compelling, but it happens pretty often and everyone who is doing it really needs to stop doing it. What I mean by "application state" is that the DOM elements representing the page you're producing should be a rendered view of some state which you're storing internally in a Javascript object or datastructure. They should NOT themselves be the datastructure. Suppose you have a WidgetCounter, which counts widgets. Here's a reasonable implementation: function /* class */ WidgetCounter(display_field) { this.display = $(display_field); this.widgetCount = 0; this._redraw(); } WidgetCounter.prototype.getWidgetCount = function() { return this.widgetCount; } WidgetCounter.prototype.addWidget = function() { this.widgetCount++; this._redraw(); } WidgetCounter.prototype._redraw = function() { DOM.setContent(this.display, 'Widgets: '+this.widgetCount); } Sometimes, though, we'll get a design that looks like this: COUNTEREXAMPLE function /* class */ HorribleWidgetCounter(display_field) { this.display = $(display_field); DOM.setContent(this.display, 'Widgets: 0'); } HorribleWidgetCounter.prototype.getWidgetCount = function() { return this.display.innerHTML.match(/\d+/)[0]; } HorribleWidgetCounter.prototype.addWidget = function() { DOM.setContent(this.display, 'Widgets: '+(this.getWidgetCount()+1)); } Even ignoring internationalization concerns, I hope this example is so egregiously bad that it speaks for itself. 
I don't think anyone would actually implement this, but we get many more subtle flavors of it. For example, the photo tagging code limits the number of tags to 30; it does this by counting the number of childNodes: COUNTEREXAMPLE if (tagsDiv.childNodes.length < 30 && ge(tagsID+'_error')) { $(tagsID+'_error').style.display = 'none'; } This practice is pervasive. A recent bug (circa July 2008) came down to a system storing its state not only in the DOM but in the "className" field. Someone changed some CSS on another page, which necessitated a followup CSS fix to deal with long field names, which cascaded into a minor CSS fix, which broke a portion of the underlying system. If a third-degree cascade of CSS-only changes can break your feature -- not by changing the display, but by changing actual execution paths -- you're doing it wrong. Two tangents here: First, it's also bad to rely on DOM context, like this (paraphrased but not really exaggerated) line that used to exist somewhere in poke: COUNTEREXAMPLE hide(pokeContainer.parentNode.parentNode.parentNode.parentNode.parentNode); "If there are no pokes left, hide the whole poke panel." (Someone removed a couple divs and this started hiding the BODY tag.) You should generally acquire references to nodes either by using $() or by using DOM.scry() from some parent container you've used $() on. The advantage of using DOM.scry() over DOM-walking chains is that the nodes can be moved around in a lot of ways without breaking your code. You should generally avoid using and particularly avoid chaining parentNode, childNodes, firstChild, nextSibling, etc. This isn't completely hard-and-fast, but almost all cases of "element.childNodes[3].firstChild" are bad and all of them are fragile. Second, it's way way less bad to use non-DOM properties of DOM objects to hold appropriate state. 
For instance, code like this (related to the earlier bug) is inherently fragile: COUNTEREXAMPLE - if (container.childNodes[i].className == 'friends_'+entity) + if (container.childNodes[i].className == 'friends_'+entity) It relies on unchanging DOM relationships, it relies on nothing and no one ever touching className, and it constructs a classname programmatically which makes it more difficult to debug when it breaks or find when you're changing things. A far less bad implementation might use :DOMStorage: if (DOMStorage.getData(container.childNodes[i], 'privacyType') == entity) This is still kind of nasty and I'd argue that a better design would see the code iterating over objects instead of over DOM nodes, using getters, and calling rendering methods to make changes reflect in the DOM. But, at the least, this is much less fragile and usually a practical alternative which doesn't require all that time-consuming "software engineering" associated with using classes and objects. - diff --git a/src/docs/drafts/javascript_scope_resolution.txt b/src/docs/drafts/javascript_scope_resolution.txt index 5c9a8ce..187abb7 100644 --- a/src/docs/drafts/javascript_scope_resolution.txt +++ b/src/docs/drafts/javascript_scope_resolution.txt @@ -1,149 +1,148 @@ This article describes how scope resolution works in Javascript. Note that Firebug creates a fake global scope, so if you try some of these examples in Firebug they may behave differently than if you put them in a .js file. =window= Javascript's global scope is a variable named window. At global scope, these statements are all equivalent: u = 1; var u = 1; window.u = 1; Everything you define at global scope exists at global scope, so you can iterate over window to get a list of all the functions and classes, plus a whole pile of browser builtins. For instance, to test if a function exists at global scope: if (window['function']) { ... } =Scope= There are two ways to create a new scope: function and with. 
Never use with -- it's a cute idea, but some of the browser implementations suck and it's more trouble than it's worth (see [[Articles/Javascript Design Flaws]]). Other statements do not introduce scope in Javascript. So there's one way to create a new scope that you are actually allowed to use: creating a function. A function definition creates a new scope: function f(param) { var local; param = 'param'; local = 'local'; global = 'global'; } Local variables and parameter variables have resolution precedence over global variables, so param is resolved to the parameter and local is resolved to the function-local. However, global is not declared anywhere in this scope, so it will resolve to the global global. Equivalently, you could write window.global. In general, you can force global resolution with window. Two notes: first, don't ever give a local and a parameter the same name. You will be decidedly unhappy about the result. Second, the var keyword applies regardless of where in the function scope it appears, so this block also creates a local called local, even though the line var local; will never be executed. That is, this function will always return undefined regardless of the value of window.local. function f() { return local; if (false) { var local; } } It may be helpful to think of this as: one pass to find all the var keywords, then a second pass to actually execute the code. =Advanced Scope= Of course, you can define functions inside functions. In this case, identifiers which are not locals or parameters will be resolved at the parent scope, and so on until window is reached. function f() { var l = 35; function g() { return l; } g(); // 35 } Here, l is resolved at f's scope. Note that g is a ``closure'' -- it encloses a variable from its containing scope. 
This reference does not dissolve when the scope dissolves: function buildBeancounterFunctions() { var bean_count = 0; return { inc : function() { bean_count++; }, get : function() { return bean_count; } }; } var b = buildBeancounterFunctions(); b.inc(); b.inc(); b.get(); // 2 This works as expected; the reference to the bean_count local is preserved by the closures even though the scope created by buildBeancounterFunctions has dissolved. If you call buildBeancounterFunctions again, you'll get a different set of functions enclosing a different variable. (But don't build classes like this, see [[Articles/Object-Oriented Javascript]] instead.) =Really Advanced Scope= One caveat is that this goes the other way, too: function assignButtonHandlers(buttons) { for (var ii = 0; ii < buttons.length; ii++) { buttons[ii].onclick = function() { alert(ii); } } } Suppose you pass in an array of three buttons. Since all of the closures are referencing the same variable, the buttons will not alert 0, 1, and 2, but 3, 3, and 3. The solution to this is to reference different variables; to do this, you need to introduce more scope. function assignButtonHandlers(buttons) { for (var ii = 0; ii < buttons.length; ii++) { buttons[ii].onclick = (function(n) { return function() { alert(n); } })(ii); } } This creates and calls a function which takes the current value of the iterator and returns a function which encloses that value. This is difficult to understand. But generally you can just learn how bind() works and use that instead, far more simply: function assignButtonHandlers(buttons) { for (var ii = 0; ii < buttons.length; ii++) { buttons[ii].onclick = alert.bind(buttons[ii], ii); } } =arguments and this= When you create a function scope, two magic variables are automatically injected: this and arguments. arguments is an Array-like object with the function's arguments (so you can write functions which take a variable number of arguments, like sprintf()). 
arguments also has some other properties, like callee, which are occasionally useful. Look it up on the internet if you really care so very much. function howManyArguments() { return arguments.length; } howManyArguments(1, 2, 3, 4, 5); // 5 this is a variable that contains the function's calling context. What this means is that if a function is invoked as a property of an object, this will be the object which it is a property of. Basically, what you would expect from other object-oriented languages. o.f(); // inside the scope of f(), `this' is `o' o['f'](); // inside the scope of f(), `this' is `o' a.b.c.f(); // inside the scope of f(), `this' is `c' But! If a function has no calling context, this will be defined but set to window. So, this always exists inside a function scope, but sometimes it is window which is 100% Bad News Bears and almost certainly not what you want. f(); // `this' is window! That is terrible! This is particularly tricky because it is the immediate calling context that becomes this. o.f(); // inside the scope of f(), `this' is `o', but... var g = o.f; g(); // ...now it's window, sucker. Fortunately, you can inject a specific calling context using Function.call() or Function.apply(). =call and apply= Function.call() takes one or more arguments; the first is the object to inject as the function's calling context and the rest (if any) are arguments to pass. function add(n) { return this + n; } add.call(3, 5); // 8 Function.apply() works the same way, but it takes two arguments: the object to inject as this and an array of arguments to pass. So these are equivalent: add.call(3, 5); add.apply(3, [5]); But, there's generally an easier way than call or apply: bind() (which you should probably read next). 
- diff --git a/src/docs/drafts/using_exceptions.txt b/src/docs/drafts/using_exceptions.txt index 50985dd..4bcef94 100644 --- a/src/docs/drafts/using_exceptions.txt +++ b/src/docs/drafts/using_exceptions.txt @@ -1,117 +1,116 @@ This document explains how to use exceptions to handle error conditions in PHP. You should also read the newer guidelines at: http://www.intern.facebook.com/intern/wiki/index.php/PHPErrorHandling. = Use Exceptions = Use Exceptions: Error conditions in our PHP stack should be communicated through exceptions, which provide a simple, robust mechanism for handling errors. In some cases, extenuating circumstances (like legacy code which is difficult to untangle or needs to be phased in) may prevent immediate enactment, but exception-based error handling should be strongly preferred wherever reasonable, and particularly in new features. = Overview = Traditionally error handling, including much of the error handling in our codebase, revolves around returning error codes. Return codes are straightforward, but they need to be manually returned, and then manually passed up the stack. They need to be handled at every level. If they aren't, they silently vanish. The behavior of error codes is pathological: they resist doing the right thing. Unless they are handled at every level they will leap headlong into the void, never to be heard from again. Exceptions are a more powerful error handling mechanism than error codes. They have the right default behaviors: they can effect function return, and they make their way up the stack unaided by default. As a top-level caller using an API that throws exceptions, you must handle error conditions, because ignoring them means program termination. This relationship between callers (which are in a position to handle errors) and lower level APIs (which are not), which is explored in greater detail below, is both correct and desirable. Only callers can be responsible for error handling, and exceptions ensure they are. 
Using exceptions will make your life easier and your code better, because: * exceptions simplify error handling because they can effect function return and unwind the stack without coddling * exceptions make your code robust by preventing errors from silently vanishing * exceptions carry more information about error conditions, so you can better react to them * exceptions make error conditions explicit by providing a single, unambiguous way to communicate errors * exceptions simplify APIs by allowing you to use return values for returning data only However, handling error conditions with exceptions instead of error codes requires you to change your approach somewhat in order to reap the benefits: * you should catch exceptions at the highest level that can deal with them, often the top level * you should use exceptions to indicate exceptional conditions, not to affect control flow * you should not catch the base class Exception (except in very special circumstances) The remainder of this document explains these points in greater detail. =Simplified Error Handling= Exceptions simplify error handling by reducing the number of checks you need to make against return codes. Generally, exceptions allow you to omit many checks against return values and all the boilerplate code responsible for pushing error codes up the stack. Exceptions are explicitly able to effect function return and make their way up the stack without help. =Robustness= Exceptions make your code more robust because they prevent error codes from being silently dropped. With traditional error codes, any caller can forget to check them. To counteract this, we've developed the "debug_rlog()-and-return" idiom: COUNTEREXAMPLE $ok = some_function(); if (!$ok) { debug_rlog("Something didn't work!"); // Now I'm covered if my caller is lazy! Also, if I'm the caller, I don't // have to handle this error because it has already been "handled"! Great! 
return false; } This idiom arose as a compromise between two concerns: you can't handle errors properly from deep in the stack, but you need to make sure they get handled. This idiom mitigates both but solves neither: callers feel okay about ignoring errors because they've already been ``handled'' through logging, and it's easy to ignore them because there are no real consequences. The right way to resolve this is to throw an exception. Throwing sends an exception scurrying toward top level where it can be dispatched properly, and forces it to be handled or the program will terminate. This means no log spew and a guarantee that callers aren't ignoring error conditions. A concern sometimes raised about exceptions is that an uncaught exception that escapes the stack causes program termination. But, this is an extremely desirable behavior. Stated another way, exceptions mean that when your program is wrong it stops. Error codes mean that when your program is wrong, it keeps going, it just does the wrong thing. Doing the wrong thing is a much worse behavior. Unexpected program termination is not particularly bad. It's obvious, it is less likely to make it through testing in the first place, and even if it does it will show up in the logs and get fixed quickly. Most importantly, its badness is bounded: even in the worst case, it can't do more than bring the site down for a few minutes. This is already a condition we have to deal with even without exceptions, because program termination can be caused in a number of other ways (like calling a function which is not defined). Conversely, silent failure is very difficult to detect so it is more likely to make it to production in the first place. It may not show up in the logs once it's live. Most importantly, continuing execution when the program is wrong is unboundedly bad. Data corruption and privacy violations are much, much worse than bringing the site down. 
They can take weeks to clean up, or be impossible to completely revert. It takes only minutes to revert a bad push that caused program termination via uncaught exception. Although the risk of these severe outcomes is small, the risk of taking the site down is also small and these severe outcomes are much worse than the worst possible case of uncaught exceptions. Using exceptions and coding defensively is essentially insurance: you are accepting an increased risk of program termination and other well-defined failures that are easy to deal with because you want to lower the risk of nebulous failures that are difficult or impossible to deal with. This is a highly desirable tradeoff which you should be eager to make. We already have a number of libraries that communicate error conditions through exceptions or react gracefully in the presence of exceptions: for examples, see :AsyncResponse, :CodedException, :queryfx() ([[Articles/Using queryfx()]]), :Filesystem, Hypershell ([[Articles/Hypershell Architecture]]), Hyperpush ([[Articles/Hyperpush Architecture]]), Sitevars ([[Articles/Sitevar Architecture]]), and :execx() ([[Articles/System Commands]]). Far from being harbingers of uncaught-exception doom, these systems have resulted in simplified code and increased reliability. =Catch At Top Level= When you catch exceptions, you should generally catch them at the highest level where you can correctly handle the error. Often, this is at or very near the top level. It is usually wrong to convert an :Exception to a return code: COUNTEREXAMPLE try { $this->flip(); } catch (PancakeException $ex) { return false; } Converting an exception to a return code throws away much of the power of exceptions (e.g., automatic navigation up the stack, guaranteed handling, additional information). However, you may sometimes need to do this as an interim step in the process of converting a legacy API into an exception-oriented API. 
It is almost certainly wrong to convert an Exception into a debug_rlog(): COUNTEREXAMPLE try { $this->flip(); } catch (PancakeException $ex) { debug_rlog("Couldn't flip pancake."); } This is basically equivalent to: COUNTEREXAMPLE $err = pancake_flip(); if ($err) { debug_rlog('Hurf durf I am absolving myself of responsibility.'); $err = false; // No more error! Magic! } Instead, you should catch and handle exceptions at the top level, where they can be meaningfully acted upon. try { $pancake = new Pancake(); $pancake->cook(); $breakfast = $pancake; // Yum! } catch (PancakeException $ex) { $breakfast = $cereal; // Ick. :( } =Don't Catch "Exception"= You should usually avoid catching :Exception unless you are implementing a very general, top-level exception handling mechanism like the one in :AsyncResponse. Instead, catch a specific exception or exception subtree, like :CodedException, :QueryException, :CommandException, or :FilesystemException. A corollary to this is that you should avoid throwing :Exception unless you do not expect any caller to handle the exception. Essentially, throwing Exception is guaranteeing program termination (albeit via a graceful stack unwind and sensible top-level behavior rather than abrupt exit). The major use for this is checking invariants to detect that an API is being misused so you can communicate to the caller that they have strictly and unambiguously abused your interface. Because PHP has no finally clause, it is acceptable to catch :Exception if you are cleaning up resources and then re-throwing, although most kinds of resources that need cleanup (like database transactions and temporary files) already have exception-aware APIs that will handle this for you. 
- diff --git a/src/docs/drafts/using_queryfx.txt b/src/docs/drafts/using_queryfx.txt index 25c1a92..1f85ad2 100644 --- a/src/docs/drafts/using_queryfx.txt +++ b/src/docs/drafts/using_queryfx.txt @@ -1,182 +1,180 @@ - This document describes how to use queryfx(), an extended form of queryf(). = What queryfx() Does = queryfx() stands for something like "query, formatted + extensions", (or "exceptions", depending on who you ask) and is a method for easily and correctly executing queries against a MySQL database. queryfx() is similar to queryf(), but provides more conversions and better error handling. The API is similar to the sprintf() family of functions. resource queryfx(managed_connection $conn, string $query_pattern, ...); Example usage might look like this. $ret = queryfx($conn_w, 'INSERT INTO stuff (name) VALUES (%s)', $name); queryfx() will properly escape parameters so they are safe in SQL and protect you from SQL injection holes. = queryfx_one() and queryfx_all() = Many queries either expect to select exactly one row, or want to select all result rows as a list of dictionaries. In these cases, you may use queryfx_one() or queryfx_all(), respectively. queryfx_one() will return either null if zero rows match, or a dictionary if exactly one row matches. If more than one row matches, a QueryCountException will be thrown. $pie = queryfx_one($conn_r, 'SELECT * FROM pie WHERE id = %d', $id); if ($pie) { echo "The pie's flavor is {$pie['flavor']}."; } else { echo 'No such pie exists. This is sad times.'; } queryfx_all() will always return a list of dictionaries, although it may be empty. $list = queryfx_all( $conn_r, 'SELECT * FROM pie WHERE baked > %d', time() - (60 * 60)); if (count($list)) { echo 'Pies baked in the last hour: '.count($list); } else { echo 'No pies were baked in the last hour. This is sad times indeed.'; } These convenience wrappers don't cover every case, but can simplify your code in many cases. 
= Supported Conversions = queryfx() supports three simple conversions, %d (integer), %s (string), and %f (float). These work exactly like sprintf(), except that they will be properly escaped for a MySQL context. $res = queryfx( $conn_w, 'INSERT INTO pie (flavor, size) values (%s, %d)', $pie_flavor, $pie_size); Note that %s is binary-safe, so it is safe to convert UTF-8 strings or raw byte sequences using %s. In addition to these simple conversions, a wide array of additional conversions is supported. Nullable conversions work like the simple conversions but handle NULL properly. The nullable conversions are %nd, %ns, and %nf. These conversions behave exactly like the corresponding normal conversions, except when the value they are passed is a strict null. In this case, they will print NULL. // INSERT INTO t (u, v) VALUES (3, NULL) queryfx($conn_w, 'INSERT INTO t (u, v) VALUES (%nd, %nd)', 3, null); Nullable test conversions work like the simple conversions but handle equivalence with NULL properly by printing either an = or an IS NULL clause. The nullable test conversions are %=d, %=s, and %=f. // SELECT * FROM t WHERE u = 3 AND v IS NULL queryfx($conn_r, 'SELECT * FROM t WHERE u %=d AND v %=d', 3, null); List conversions accept a list of values and produce a comma-separated list, appropriate for insertion into an IN clause. The list conversions are %Ld, %Ls, and %Lf. Note: these conversions treat their arguments as nullable, so null will be converted to NULL in the query, not 0 or empty string. // SELECT * FROM t WHERE u IN ('a', 'b', 'c') queryfx($conn_r, 'SELECT * FROM t WHERE u IN (%Ls)', array('a', 'b', 'c')); Identifier conversions escape SQL identifiers like table or column names. The identifier conversions are %T (table), %C (column) and %LC (list of columns). 
// SELECT `select` FROM `from` WHERE `where` = 4 queryfx( $conn_r, 'SELECT %C FROM %T WHERE %C = %d', 'select', 'from', 'where', 4); Dictionary conversions escape a dictionary of key-value pairs into column-value pairs. The dictionary conversions are %U (update clause), %LA (list of predicates joined by AND), and %LO (list of predicates joined by OR). %LA and %LO also support array values for generating "IN" clauses. // UPDATE t SET a = 1, b = 2, c = 3 WHERE u = 5 AND d = 6 AND e IN (1, 2, 3) queryfx( $conn_w, 'UPDATE t SET %U WHERE %LA', array('a' => 1, 'b' => 2, 'c' => 3), array('u' => 5, 'd' => 6, 'e' => array(1, 2, 3))); Like conversions escape a string for a LIKE (or NOT LIKE) clause. The like conversions are %~ (substring match), %> (prefix match), and %< (suffix match). // SELECT * FROM t WHERE u LIKE '%example%' OR v LIKE 'prefix%' queryfx( $conn_w, 'SELECT * FROM t WHERE u LIKE %~ OR v LIKE %>', 'example', 'prefix'); Miscellaneous conversions escape other random junk. The miscellaneous conversions are %K (comment) and %Q (raw subquery). You must be extremely careful with %Q -- unlike other conversions, it does no escaping. If you do not use it properly, you will open a SQL injection hole. // UPDATE /* hey guys what is up */ t SET u = "v" queryfx( $conn_w, 'UPDATE %K t SET %Q', 'hey guys what is up', 'u = "v"'); Be careful with %Q because it's extremely dangerous. It should be rarely (if ever) used. Often, vqueryfx() is a better approach. = Handling Exceptions = queryfx() throws exceptions on failure, which means that you need to catch and handle them. All exceptions descend from QueryException. Generally, you should catch QueryExceptions at or near the top level -- that is, catching exceptions high in the callstack is often better than catching them low in the callstack. 
try { $pies = pies_get_for_user($uid); } catch (QueryException $ex) { $errmsg = 'Pies are not available at this time, try again later.'; } You should not catch exceptions in your database/generator function. You can't do anything useful with them here. COUNTEREXAMPLE try { $ret = queryfx($conn_r, 'SELECT * FROM pie WHERE owner = %d', $uid); } catch (QueryException $ex) { return null; } This means that writing generators is much easier: function pies_for_users_dbget($user_ids) { $rows = queryfx_all( pies_get_conn('r'), 'SELECT * FROM pies WHERE owner IN (%Ld)', $user_ids); return array_group($rows, 'owner') + array_fill_keys($user_ids, array()); } This is a complete, correct database/generator function under queryfx(). Notably, you do not need to test either the connection or the return for `null', because exceptions will be thrown in either case. Note that the cache_get_scb() layer will properly catch and handle exceptions thrown by queryfx(). There are several kinds of :QueryException: QueryException Abstract base class for all query exceptions. QueryParameterException (extends QueryException) A parameter to the query was wrong: for instance, an empty list was passed to a %Ld conversion, or the connection was `null'. QueryErrorException (extends QueryException) The database returned with an error code not specifically recognized as recoverable. QueryCountException (extends QueryException) You issued a queryfx_one() call that returned more than one row. There are also several exceptions that are considered recoverable: RecoverableQueryException (extends QueryException) Abstract base class for all "recoverable" exceptions; these are nonpermanent failures. QueryDeadlockException (extends RecoverableQueryException) The database returned, reporting a deadlock. The correct response to a deadlock condition is often to retry the query. QueryConnectionException (extends RecoverableQueryException) The MySQL connection dropped. 
This shouldn't happen with ManagedConnections, but you may run into it if you aren't using ManagedConnections. QueryDuplicateKeyException (extends RecoverableQueryException) You issued an insert or update statement which would have caused a duplicate key (on the primary key or some unique key) collision. Attempting the insert and catching this exception is often the correct way to ensure uniqueness. In most cases, it is sufficient to catch QueryException and consider it an unrecoverable error at the top level. However, you may find the fine-grained exceptions useful when building abstractions, debugging, or under unusual use cases. One caveat is that memcache_dispatch() is exception-aware but cannot currently expose exceptions at the top level. Instead, it will convert QueryExceptions into an implicit null return value from your database/generator function. This may be fixed in the future but requires improving some abstractions. - diff --git a/src/docs/overview.diviner b/src/docs/overview.diviner index 32831e0..5e8a1dc 100644 --- a/src/docs/overview.diviner +++ b/src/docs/overview.diviner @@ -1,55 +1,55 @@ @title libphutil Overview @group overview This document provides a high-level introduction to libphutil. = Overview = **libphutil** (pronounced as "lib-futile", like the English word //futile//) is a collection of PHP utility classes and functions which provide powerful extensions to the standard library. This code was originally developed at Facebook and parts of it appear in the core libraries for <http://www.facebook.com/>. libphutil is principally the shared library for **Arcanist** and **Phabricator** (see <http://www.phabricator.com/>), but is suitable for inclusion in other projects. In particular, some of the classes provided in this library vastly improve the state of common operations in PHP, like executing system commands.
= Loading libphutil = To include libphutil in another project, include the ##src/__phutil_library_init__.php## file: require_once 'path/to/libphutil/src/__phutil_library_init__.php'; This loads global functions and registers an autoload function with ##spl_autoload_register()##, so you can also use classes. = Major Components = The major components of libphutil are: - - **Core Utilties**: a collection of useful functions like @{function:ipull} + - **Core Utilities**: a collection of useful functions like @{function:ipull} which simplify common data manipulation; - **Filesystem**: classes like @{class:Filesystem} which provide a strict API for filesystem access and throw exceptions on failure, making it easier to write robust code which interacts with files; - **Command Execution**: libphutil provides a powerful system command primitive in @{class:ExecFuture} which makes it far easier to write command-line scripts which execute system commands (see @{article:Command Execution}); - **@{function:xsprintf}**: allows you to define ##sprintf()##-style functions which use custom conversions; and - **Library System**: an introspectable, inventoried system for organizing PHP code and managing dependencies, supported by static analysis. = Extending and Contributing = Information on extending and contributing to libphutil is available in the Phabricator documentation: - to get started as a contributor, see @{article@phabricator:Contributor Introduction}. diff --git a/src/docs/using_futures.diviner b/src/docs/using_futures.diviner index 7dc9969..a554fd6 100644 --- a/src/docs/using_futures.diviner +++ b/src/docs/using_futures.diviner @@ -1,91 +1,91 @@ @title Using Futures @group futures Overview of how futures work in libphutil. = Overview = Futures (also called "Promises") are objects which represent the result of some pending computation (like executing a command or making a request to another server), but don't actually hold that result until the computation finishes. 
They are used to simplify parallel programming, since you can pass the future around as a representation for the real result while the real result is being computed in the background. When the object is asked to return the actual result, it blocks until the result is available. libphutil provides a number of future-based APIs, as they strike a good balance between ease of use and power for many of the domains where PHP is a reasonable language choice. Each type of future is used to do a different type of computation (for instance, @{class:ExecFuture} executes system commands while @{class:HTTPFuture} executes HTTP requests), but all of them behave in a basically similar way and can be manipulated with the same top-level constructs. = Basics = You create a future by instantiating the relevant class and ask it to return the result by calling ##resolve()##: $gzip_future = new ExecFuture("gzip %s", $some_file); $gzip_future->start(); // The future is now executing in the background, and you can continue // doing computation in this process by putting code here. list($err, $stdout, $stderr) = $gzip_future->resolve(); When you call ##resolve()##, the future blocks until the result is ready. You can test if a future's result is ready by calling ##isReady()##: $is_ready = $gzip_future->isReady(); Being "ready" indicates that the future's computation has completed and it will not need to block when you call ##resolve()##. Note that when you instantiate a future, it does not immediately initiate computation. You must call ##start()##, ##isReady()## or ##resolve()## to activate it. If you simply call ##resolve()## it will start, block until it is complete, and then return the result, acting in a completely synchronous way. See @{article:Command Execution} for more detailed documentation on how to execute system commands with libphutil. 
= Managing Multiple Futures = Commonly, you may have many similar tasks you wish to parallelize: instead of compressing one file, you want to compress several files. You can use the @{class:FutureIterator} class to manage multiple futures, via the convenience function @{function:Futures}. $futures = array(); foreach ($files as $file) { $futures[$file] = new ExecFuture("gzip %s", $file); } foreach (Futures($futures) as $file => $future) { list($err, $stdout, $stderr) = $future->resolve(); if (!$err) { echo "Compressed {$file}...\n"; } else { echo "Failed to compress {$file}!\n"; } } @{function:Futures} takes a list of futures and runs them in parallel, **returning them in the order they resolve, NOT the original list order**. This allows your program to begin any followup computation as quickly as possible: if the slowest future in the list happens to be the first one, you can finish processing all the other futures while waiting for it. You can also limit how many futures you want to run at once. For instance, to process no more than 4 files simultaneously: foreach (Futures($futures)->limit(4) as $file => $future) { // ... } Consult the @{class:FutureIterator} documentation for detailed information on -class capabilities. \ No newline at end of file +class capabilities. 
diff --git a/src/events/PhutilEvent.php b/src/events/PhutilEvent.php index f1ebec7..12d98da 100644 --- a/src/events/PhutilEvent.php +++ b/src/events/PhutilEvent.php @@ -1,45 +1,40 @@ type = $type; $this->data = $data; } public function getType() { return $this->type; } public function getValue($key, $default = null) { return idx($this->data, $key, $default); } public function setValue($key, $value) { $this->data[$key] = $value; return $this; } public function stop() { $this->stop = true; return $this; } public function isStopped() { return $this->stop; } } - - - - - diff --git a/src/filesystem/FileFinder.php b/src/filesystem/FileFinder.php index a762c4a..d60fbab 100644 --- a/src/filesystem/FileFinder.php +++ b/src/filesystem/FileFinder.php @@ -1,269 +1,268 @@ withType('f') * ->withSuffix('php') * ->find(); * * @task create Creating a File Query * @task config Configuring File Queries * @task exec Executing the File Query * @task internal Internal * @group filesystem */ final class FileFinder { private $root; private $exclude = array(); private $paths = array(); private $suffix = array(); private $type; private $generateChecksums = false; private $followSymlinks; private $forceMode; /** * Create a new FileFinder. * * @param string Root directory to find files beneath. 
* @return this * @task create */ public function __construct($root) { $this->root = rtrim($root, '/'); } /** * @task config */ public function excludePath($path) { $this->exclude[] = $path; return $this; } /** * @task config */ public function withSuffix($suffix) { $this->suffix[] = '*.'.$suffix; return $this; } /** * @task config */ public function withPath($path) { $this->paths[] = $path; return $this; } /** * @task config */ public function withType($type) { $this->type = $type; return $this; } /** * @task config */ public function withFollowSymlinks($follow) { $this->followSymlinks = $follow; return $this; } /** * @task config */ public function setGenerateChecksums($generate) { $this->generateChecksums = $generate; return $this; } /** * @task config * @param string Either "php", "shell", or the empty string. */ public function setForceMode($mode) { $this->forceMode = $mode; return $this; } /** * @task internal */ public function validateFile($file) { $matches = (count($this->suffix) == 0); foreach ($this->suffix as $curr_suffix) { if (fnmatch($curr_suffix, $file)) { $matches = true; break; } } if (!$matches) { return false; } $matches = (count($this->paths) == 0); foreach ($this->paths as $path) { if (fnmatch($path, $this->root.'/'.$file)) { $matches = true; break; } } $fullpath = $this->root.'/'.ltrim($file, '/'); if (($this->type == 'f' && is_dir($fullpath)) || ($this->type == 'd' && !is_dir($fullpath))) { $matches = false; } return $matches; } /** * @task internal */ private function getFiles($dir) { $found = Filesystem::listDirectory($this->root.'/'.$dir, true); $files = array(); if (strlen($dir) > 0) { $dir = rtrim($dir, '/').'/'; } foreach ($found as $filename) { // Only exclude files whose names match relative to the root. 
if ($dir == "") { $matches = true; foreach ($this->exclude as $exclude_path) { if (fnmatch(ltrim($exclude_path, './'), $dir.$filename)) { $matches = false; break; } } if (!$matches) { continue; } } if ($this->validateFile($dir.$filename)) { $files[] = $dir.$filename; } if (is_dir($this->root.'/'.$dir.$filename)) { foreach ($this->getFiles($dir.$filename) as $file) { $files[] = $file; } } } return $files; } /** * @task exec */ public function find() { $files = array(); if (!is_dir($this->root) || !is_readable($this->root)) { throw new Exception( "Invalid FileFinder root directory specified ('{$this->root}'). ". "Root directory must be a directory, be readable, and be specified ". "with an absolute path."); } if ($this->forceMode == "shell") { $php_mode = false; } else if ($this->forceMode == "php") { $php_mode = true; } else { $php_mode = (phutil_is_windows() || !Filesystem::binaryExists('find')); } if ($php_mode) { $files = $this->getFiles(""); } else { $args = array(); $command = array(); $command[] = 'find'; if ($this->followSymlinks) { $command[] = '-L'; } $command[] = '.'; if ($this->exclude) { $command[] = $this->generateList('path', $this->exclude).' -prune'; $command[] = '-o'; } if ($this->type) { $command[] = '-type %s'; $args[] = $this->type; } if ($this->suffix) { $command[] = $this->generateList('name', $this->suffix); } if ($this->paths) { $command[] = $this->generateList('path', $this->paths); } $command[] = '-print0'; array_unshift($args, implode(' ', $command)); list($stdout) = newv('ExecFuture', $args) ->setCWD($this->root) ->resolvex(); $stdout = trim($stdout); if (!strlen($stdout)) { return array(); } $files = explode("\0", $stdout); // On OSX/BSD, find prepends a './' to each file. 
for ($i = 0; $i < count($files); $i++) { if (substr($files[$i], 0, 2) == './') { $files[$i] = substr($files[$i], 2); } } } if (!$this->generateChecksums) { return $files; } else { $map = array(); foreach ($files as $line) { $fullpath = $this->root.'/'.ltrim($line, '/'); if (is_dir($fullpath)) { $map[$line] = null; } else { $map[$line] = md5_file($fullpath); } } return $map; } } /** * @task internal */ private function generateList($flag, array $items) { $items = array_map('escapeshellarg', $items); foreach ($items as $key => $item) { $items[$key] = '-'.$flag.' '.$item; } $items = implode(' -o ', $items); return '"(" '.$items.' ")"'; } } - diff --git a/src/filesystem/Filesystem.php b/src/filesystem/Filesystem.php index 25d7963..c463be3 100644 --- a/src/filesystem/Filesystem.php +++ b/src/filesystem/Filesystem.php @@ -1,1058 +1,1057 @@ > 3]; } return $result; } /** * Identify the MIME type of a file. This returns only the MIME type (like * text/plain), not the encoding (like charset=utf-8). * * @param string Path to the file to examine. * @param string Optional default mime type to return if the file's mime * type can not be identified. * @return string File mime type. * * @task file * * @phutil-external-symbol function mime_content_type * @phutil-external-symbol function finfo_open * @phutil-external-symbol function finfo_file */ public static function getMimeType( $path, $default = 'application/octet-stream') { $path = self::resolvePath($path); self::assertExists($path); self::assertIsFile($path); self::assertReadable($path); $mime_type = null; // Fileinfo is the best approach since it doesn't rely on `file`, but // it isn't builtin for older versions of PHP. if (function_exists('finfo_open')) { $finfo = finfo_open(FILEINFO_MIME); if ($finfo) { $result = finfo_file($finfo, $path); if ($result !== false) { $mime_type = $result; } } } // If we failed Fileinfo, try `file`. This works well but not all systems // have the binary. 
if ($mime_type === null) { list($err, $stdout) = exec_manual( 'file --brief --mime %s', $path); if (!$err) { $mime_type = trim($stdout); } } // If we didn't get anywhere, try the deprecated mime_content_type() // function. if ($mime_type === null) { if (function_exists('mime_content_type')) { $result = mime_content_type($path); if ($result !== false) { $mime_type = $result; } } } // If we come back with an encoding, strip it off. if (strpos($mime_type, ';') !== false) { list($type, $encoding) = explode(';', $mime_type, 2); $mime_type = $type; } if ($mime_type === null) { $mime_type = $default; } return $mime_type; } /* -( Directories )-------------------------------------------------------- */ /** * Create a directory in a manner similar to mkdir(), but throw detailed * exceptions on failure. * * @param string Path to directory. The parent directory must exist and * be writable. * @param int Permission umask. Note that umask is in octal, so you * should specify it as, e.g., `0777', not `777'. By * default, these permissions are very liberal (0777). * @param boolean Recursivly create directories. Default to false * @return string Path to the created directory. * * @task directory */ public static function createDirectory($path, $umask = 0777, $recursive = false) { $path = self::resolvePath($path); if (is_dir($path)) { Filesystem::changePermissions($path, $umask); return $path; } $dir = dirname($path); if ($recursive && !file_exists($dir)) { // Note: We could do this with the recursive third parameter of mkdir(), // but then we loose the helpful FilesystemExceptions we normally get. self::createDirectory($dir, $umask, true); } self::assertIsDirectory($dir); self::assertExists($dir); self::assertWritable($dir); self::assertNotExists($path); if (!mkdir($path, $umask)) { throw new FilesystemException( $path, "Failed to create directory `{$path}'."); } // Need to change premissions explicitly because mkdir does something // slightly different. 
mkdir(2) man page: // 'The parameter mode specifies the permissions to use. It is modified by // the process's umask in the usual way: the permissions of the created // directory are (mode & ~umask & 0777)."' Filesystem::changePermissions($path, $umask); return $path; } /** * Create a temporary directory and return the path to it. You are * responsible for removing it (e.g., with Filesystem::remove()) * when you are done with it. * * @param string Optional directory prefix. * @param int Permissions to create the directory with. By default, * these permissions are very restrictive (0700). * @return string Path to newly created temporary directory. * * @task directory */ public static function createTemporaryDirectory($prefix = '', $umask = 0700) { $prefix = preg_replace('/[^A-Z0-9._-]+/i', '', $prefix); $tmp = sys_get_temp_dir(); if (!$tmp) { throw new FilesystemException( $tmp, 'Unable to determine system temporary directory.'); } $base = $tmp.DIRECTORY_SEPARATOR.$prefix; $tries = 3; do { $dir = $base.substr(base_convert(md5(mt_rand()), 16, 36), 0, 16); try { self::createDirectory($dir, $umask); break; } catch (FilesystemException $ex) { // Ignore. } } while (--$tries); if (!$tries) { $df = disk_free_space($tmp); if ($df !== false && $df < 1024 * 1024) { throw new FilesystemException( $dir, pht("Failed to create a temporary directory: the disk is full.")); } throw new FilesystemException( $dir, pht("Failed to create a temporary directory in '%s'.", $tmp)); } return $dir; } /** * List files in a directory. * * @param string Path, absolute or relative to PWD. * @param bool If false, exclude files beginning with a ".". * * @return array List of files and directories in the specified * directory, excluding `.' and `..'. 
* * @task directory */ public static function listDirectory($path, $include_hidden = true) { $path = self::resolvePath($path); self::assertExists($path); self::assertIsDirectory($path); self::assertReadable($path); $list = @scandir($path); if ($list === false) { throw new FilesystemException( $path, "Unable to list contents of directory `{$path}'."); } foreach ($list as $k => $v) { if ($v == '.' || $v == '..' || (!$include_hidden && $v[0] == '.')) { unset($list[$k]); } } return array_values($list); } /** * Return all directories between a path and "/". Iterating over them walks * from the path to the root. * * @param string Path, absolute or relative to PWD. * @return list List of parent paths, including the provided path. * @task directory */ public static function walkToRoot($path) { $path = self::resolvePath($path); if (is_link($path)) { $path = realpath($path); } $walk = array(); $parts = explode(DIRECTORY_SEPARATOR, $path); foreach ($parts as $k => $part) { if (!strlen($part)) { unset($parts[$k]); } } do { if (phutil_is_windows()) { $walk[] = implode(DIRECTORY_SEPARATOR, $parts); } else { $walk[] = DIRECTORY_SEPARATOR.implode(DIRECTORY_SEPARATOR, $parts); } if (empty($parts)) { break; } array_pop($parts); } while (true); return $walk; } /* -( Paths )-------------------------------------------------------------- */ /** * Canonicalize a path by resolving it relative to some directory (by * default PWD), following parent symlinks and removing artifacts. If the * path is itself a symlink it is left unresolved. * * @param string Path, absolute or relative to PWD. * @return string Canonical, absolute path. 
* * @task path */ public static function resolvePath($path, $relative_to = null) { if (phutil_is_windows()) { $is_absolute = preg_match('/^[A-Za-z]+:/', $path); } else { $is_absolute = !strncmp($path, DIRECTORY_SEPARATOR, 1); } if (!$is_absolute) { if (!$relative_to) { $relative_to = getcwd(); } $path = $relative_to.DIRECTORY_SEPARATOR.$path; } if (is_link($path)) { $parent_realpath = realpath(dirname($path)); if ($parent_realpath !== false) { return $parent_realpath.DIRECTORY_SEPARATOR.basename($path); } } $realpath = realpath($path); if ($realpath !== false) { return $realpath; } // This won't work if the file doesn't exist or is on an unreadable mount // or something crazy like that. Try to resolve a parent so we at least // cover the nonexistent file case. $parts = explode(DIRECTORY_SEPARATOR, trim($path, DIRECTORY_SEPARATOR)); while (end($parts) !== false) { array_pop($parts); if (phutil_is_windows()) { $attempt = implode(DIRECTORY_SEPARATOR, $parts); } else { $attempt = DIRECTORY_SEPARATOR.implode(DIRECTORY_SEPARATOR, $parts); } $realpath = realpath($attempt); if ($realpath !== false) { $path = $realpath.substr($path, strlen($attempt)); break; } } return $path; } /** * Test whether a path is descendant from some root path after resolving all * symlinks and removing artifacts. Both paths must exists for the relation * to obtain. A path is always a descendant of itself as long as it exists. * * @param string Child path, absolute or relative to PWD. * @param string Root path, absolute or relative to PWD. * @return bool True if resolved child path is in fact a descendant of * resolved root path and both exist. * @task path */ public static function isDescendant($path, $root) { try { self::assertExists($path); self::assertExists($root); } catch (FilesystemException $e) { return false; } $fs = new FileList(array($root)); return $fs->contains($path); } /** * Convert a canonical path to its most human-readable format. 
It is * guaranteed that you can use resolvePath() to restore a path to its * canonical format. * * @param string Path, absolute or relative to PWD. * @param string Optionally, working directory to make files readable * relative to. * @return string Human-readable path. * * @task path */ public static function readablePath($path, $pwd = null) { if ($pwd === null) { $pwd = getcwd(); } foreach (array($pwd, self::resolvePath($pwd)) as $parent) { $parent = rtrim($parent, DIRECTORY_SEPARATOR).DIRECTORY_SEPARATOR; $len = strlen($parent); if (!strncmp($parent, $path, $len)) { $path = substr($path, $len); return $path; } } return $path; } /** * Determine whether or not a path exists in the filesystem. This differs from * file_exists() in that it returns true for symlinks. This method does not * attempt to resolve paths before testing them. * * @param string Test for the existence of this path. * @return bool True if the path exists in the filesystem. * @task path */ public static function pathExists($path) { return file_exists($path) || is_link($path); } /** * Determine if an executable binary (like `git` or `svn`) exists within * the configured `$PATH`. * * @param string Binary name, like `'git'` or `'svn'`. * @return bool True if the binary exists and is executable. * @task exec */ public static function binaryExists($binary) { return self::resolveBinary($binary) !== null; } /** * Locates the full path that an executable binary (like `git` or `svn`) is at * the configured `$PATH`. * * @param string Binary name, like `'git'` or `'svn'`. * @return string The full binary path if it is present, or null. * @task exec */ public static function resolveBinary($binary) { if (phutil_is_windows()) { list($err, $stdout) = exec_manual('where %s', $binary); $stdout = phutil_split_lines($stdout); if (!$stdout) { return null; } $stdout = head($stdout); } else { list($err, $stdout) = exec_manual('which %s', $binary); } return $err === 0 ? 
trim($stdout) : null; } /** * Determine if two paths are equivalent by resolving symlinks. This is * different from resolving both paths and comparing them because * resolvePath() only resolves symlinks in parent directories, not the * path itself. * * @param string First path to test for equivalence. * @param string Second path to test for equivalence. * @return bool True if both paths are equivalent, i.e. reference the same * entity in the filesystem. * @task path */ public static function pathsAreEquivalent($u, $v) { $u = Filesystem::resolvePath($u); $v = Filesystem::resolvePath($v); $real_u = realpath($u); $real_v = realpath($v); if ($real_u) { $u = $real_u; } if ($real_v) { $v = $real_v; } return ($u == $v); } /* -( Assert )------------------------------------------------------------- */ /** * Assert that something (e.g., a file, directory, or symlink) exists at a * specified location. * * @param string Assert that this path exists. * @return void * * @task assert */ public static function assertExists($path) { if (!self::pathExists($path)) { throw new FilesystemException( $path, "Filesystem entity `{$path}' does not exist."); } } /** * Assert that nothing exists at a specified location. * * @param string Assert that this path does not exist. * @return void * * @task assert */ public static function assertNotExists($path) { if (file_exists($path) || is_link($path)) { throw new FilesystemException( $path, "Path `{$path}' already exists!"); } } /** * Assert that a path represents a file, strictly (i.e., not a directory). * * @param string Assert that this path is a file. * @return void * * @task assert */ public static function assertIsFile($path) { if (!is_file($path)) { throw new FilesystemException( $path, "Requested path `{$path}' is not a file."); } } /** * Assert that a path represents a directory, strictly (i.e., not a file). * * @param string Assert that this path is a directory. 
* @return void * * @task assert */ public static function assertIsDirectory($path) { if (!is_dir($path)) { throw new FilesystemException( $path, "Requested path `{$path}' is not a directory."); } } /** * Assert that a file or directory exists and is writable. * * @param string Assert that this path is writable. * @return void * * @task assert */ public static function assertWritable($path) { if (!is_writable($path)) { throw new FilesystemException( $path, "Requested path `{$path}' is not writable."); } } /** * Assert that a file or directory exists and is readable. * * @param string Assert that this path is readable. * @return void * * @task assert */ public static function assertReadable($path) { if (!is_readable($path)) { throw new FilesystemException( $path, "Path `{$path}' is not readable."); } } } - diff --git a/src/future/aws/PhutilAWSException.php b/src/future/aws/PhutilAWSException.php index 900405f..b8b7aff 100644 --- a/src/future/aws/PhutilAWSException.php +++ b/src/future/aws/PhutilAWSException.php @@ -1,49 +1,48 @@ httpStatus = $http_status; $this->requestID = idx($params, 'RequestID'); $this->params = $params; $desc = array(); $desc[] = 'AWS Request Failed'; $desc[] = 'HTTP Status Code: '.$http_status; if ($this->requestID) { $desc[] = 'AWS Request ID: '.$this->requestID; $errors = idx($params, 'Errors'); if ($errors) { $desc[] = 'AWS Errors:'; foreach ($errors as $error) { list($code, $message) = $error; $desc[] = " - {$code}: {$message}\n"; } } } else { $desc[] = 'Response Body: '.idx($params, 'body'); } $desc = implode("\n", $desc); parent::__construct($desc); } public function getRequestID() { return $this->requestID; } public function getHTTPStatus() { return $this->httpStatus; } } - diff --git a/src/future/exec/CommandException.php b/src/future/exec/CommandException.php index e60e186..81cea8e 100644 --- a/src/future/exec/CommandException.php +++ b/src/future/exec/CommandException.php @@ -1,78 +1,77 @@ command = $command; $this->error = $error; 
$this->stdout = $stdout; $this->stderr = $stderr; $summary = array(); $summary[] = $this->summarize($message); $summary[] = "COMMAND"; $summary[] = $this->summarize($command); $summary[] = null; $summary[] = "STDOUT"; $summary[] = $this->summarize($stdout); $summary[] = null; $summary[] = "STDERR"; $summary[] = $this->summarize($stderr); $summary = implode("\n", $summary); parent::__construct($summary); } public function getCommand() { return $this->command; } public function getError() { return $this->error; } public function getStdout() { return $this->stdout; } public function getStderr() { return $this->stderr; } private function summarize($string) { if (!strlen($string)) { return '(empty)'; } $limit = 1000; $len = strlen($string); if ($len <= $limit) { return $string; } $cut = $len - $limit; $suffix = "... (".number_format($cut)." more bytes) ..."; if ($cut > strlen($suffix)) { return substr($string, 0, $limit).$suffix; } else { return $string; } } } - diff --git a/src/future/http/status/HTTPFutureResponseStatus.php b/src/future/http/status/HTTPFutureResponseStatus.php index f17a770..dc99279 100644 --- a/src/future/http/status/HTTPFutureResponseStatus.php +++ b/src/future/http/status/HTTPFutureResponseStatus.php @@ -1,44 +1,43 @@ statusCode = $status_code; $this->uri = (string)$uri; $type = $this->getErrorCodeType($status_code); $description = $this->getErrorCodeDescription($status_code); $uri_info = ''; if ($this->uri) { $uri_info = ' ('.$this->uri.')'; } $message = rtrim("[{$type}/{$status_code}]{$uri_info} {$description}"); parent::__construct($message); } final public function getStatusCode() { return $this->statusCode; } final public function getURI() { return $this->uri; } abstract public function isError(); abstract public function isTimeout(); abstract protected function getErrorCodeType($code); abstract protected function getErrorCodeDescription($code); } - diff --git a/src/future/http/status/HTTPFutureResponseStatusCURL.php 
b/src/future/http/status/HTTPFutureResponseStatusCURL.php index 54bc6be..0546c80 100644 --- a/src/future/http/status/HTTPFutureResponseStatusCURL.php +++ b/src/future/http/status/HTTPFutureResponseStatusCURL.php @@ -1,87 +1,86 @@ getStatusCode() == CURLE_OPERATION_TIMEOUTED); } protected function getErrorCodeDescription($code) { $constants = get_defined_constants(); $constant_name = null; foreach ($constants as $constant => $value) { if ($value == $code && preg_match('/^CURLE_/', $constant)) { $constant_name = '<'.$constant.'> '; break; } } $map = array( CURLE_COULDNT_RESOLVE_HOST => 'There was an error resolving the server hostname. Check that you are '. 'connected to the internet and that DNS is correctly configured. (Did '. 'you add the domain to `/etc/hosts` on some other machine, but not '. 'this one?)', CURLE_SSL_CACERT => 'There was an error verifying the SSL Certificate Authority while '. 'negotiating the SSL connection. This usually indicates that you are '. 'using a self-signed certificate but have not added your CA to the '. 'CA bundle. See instructions in "libphutil/resources/ssl/README".', // Apparently there's no error constant for this? In cURL it's // CURLE_SSL_CACERT_BADFILE but there's no corresponding constant in // PHP. 77 => 'The SSL CA Bundles that we tried to use could not be read or are '. 'not formatted correctly.', CURLE_SSL_CONNECT_ERROR => 'There was an error negotiating the SSL connection. This usually '. 'indicates that the remote host has a bad SSL certificate, or your '. 'local host has some sort of SSL misconfiguration which prevents it '. 'from accepting the CA. If you are using a self-signed certificate, '. 'see instructions in "libphutil/resources/ssl/README".', CURLE_OPERATION_TIMEOUTED => 'The request took too long to complete.', CURLE_SSL_PEER_CERTIFICATE => 'There was an error verifying the SSL connection. This usually '. 'indicates that the remote host has an SSL certificate for a '. 
'different domain name than you are connecting with. Make sure the '. 'certificate you have installed is signed for the correct domain.', ); $default_message = "The cURL library raised an error while making a request. You may be ". "able to find more information about this error (error code: {$code}) ". "on the cURL site: http://curl.haxx.se/libcurl/c/libcurl-errors.html#". preg_replace('/[^A-Z]/', '', $constant_name); $detailed_message = idx($map, $code, $default_message); return $constant_name.$detailed_message; } } - diff --git a/src/future/http/status/HTTPFutureResponseStatusHTTP.php b/src/future/http/status/HTTPFutureResponseStatusHTTP.php index 43f61e2..96468e1 100644 --- a/src/future/http/status/HTTPFutureResponseStatusHTTP.php +++ b/src/future/http/status/HTTPFutureResponseStatusHTTP.php @@ -1,66 +1,65 @@ 512) { $excerpt = substr($body, 0, 512).'...'; } else { $excerpt = $body; } $content_type = BaseHTTPFuture::getHeader($headers, 'Content-Type'); $match = null; if (preg_match('/;\s*charset=([^;]+)/', $content_type, $match)) { $encoding = trim($match[1], "\"'"); try { $excerpt = phutil_utf8_convert($excerpt, 'UTF-8', $encoding); } catch (Exception $ex) { } } $this->excerpt = phutil_utf8ize($excerpt); $this->expect = $expect; parent::__construct($status_code); } protected function getErrorCodeType($code) { return 'HTTP'; } public function isError() { if ($this->expect === null) { return ($this->getStatusCode() < 200) || ($this->getStatusCode() > 299); } return !in_array($this->getStatusCode(), $this->expect, true); } public function isTimeout() { return false; } protected function getErrorCodeDescription($code) { static $map = array( 404 => 'Not Found', 500 => 'Internal Server Error', ); return idx($map, $code)."\n".$this->excerpt."\n"; } } - diff --git a/src/future/http/status/HTTPFutureResponseStatusParse.php b/src/future/http/status/HTTPFutureResponseStatusParse.php index a1f9ef7..5bdfe8c 100644 --- 
a/src/future/http/status/HTTPFutureResponseStatusParse.php +++ b/src/future/http/status/HTTPFutureResponseStatusParse.php @@ -1,34 +1,33 @@ rawResponse = $raw_response; parent::__construct($code); } protected function getErrorCodeType($code) { return 'Parse'; } public function isError() { return true; } public function isTimeout() { return false; } protected function getErrorCodeDescription($code) { return "The remote host returned something other than an HTTP response: ". $this->rawResponse; } } - diff --git a/src/future/http/status/HTTPFutureResponseStatusTransport.php b/src/future/http/status/HTTPFutureResponseStatusTransport.php index 1e1bd49..5fa9513 100644 --- a/src/future/http/status/HTTPFutureResponseStatusTransport.php +++ b/src/future/http/status/HTTPFutureResponseStatusTransport.php @@ -1,47 +1,46 @@ getStatusCode() == self::ERROR_TIMEOUT); } protected function getErrorCodeDescription($code) { $map = array( self::ERROR_TIMEOUT => 'The request took too long to complete.', self::ERROR_CONNECTION_ABORTED => 'The remote host closed the connection before the request completed.', self::ERROR_CONNECTION_REFUSED => 'The remote host refused the connection. This usually means the '. 'host is not running an HTTP server, or the network is blocking '. 'connections from this machine. Verify you can connect to the '. 'remote host from this host.', self::ERROR_CONNECTION_FAILED => 'Connection could not be initiated. This usually indicates a DNS '. 'problem: verify the domain name is correct, that you can '. 'perform a DNS lookup for it from this machine. (Did you add the '. 'domain to `/etc/hosts` on some other machine, but not this one?) '. 
'This might also indicate that you specified the wrong port.', ); return idx($map, $code); } } - diff --git a/src/markup/engine/remarkup/blockrule/PhutilRemarkupEngineRemarkupHeaderBlockRule.php b/src/markup/engine/remarkup/blockrule/PhutilRemarkupEngineRemarkupHeaderBlockRule.php index 7002cff..3c9dc81 100644 --- a/src/markup/engine/remarkup/blockrule/PhutilRemarkupEngineRemarkupHeaderBlockRule.php +++ b/src/markup/engine/remarkup/blockrule/PhutilRemarkupEngineRemarkupHeaderBlockRule.php @@ -1,164 +1,164 @@ 1) { $level = ($lines[1][0] == '=') ? 1 : 2; $text = trim($lines[0]); } else { $level = 0; for ($ii = 0; $ii < min(5, strlen($text)); $ii++) { if ($text[$ii] == '=') { ++$level; } else { break; } } $text = trim($text, ' ='); } $engine = $this->getEngine(); if ($engine->isTextMode()) { $char = ($level == 1) ? '=' : '-'; return $text."\n".str_repeat($char, phutil_utf8_strlen($text)); } $use_anchors = $engine->getConfig('header.generate-toc'); $anchor = null; if ($use_anchors) { $anchor = $this->generateAnchor($level, $text); } $text = phutil_tag( 'h'.($level + 1), array(), array($anchor, $this->applyRules($text))); return $text; } private function generateAnchor($level, $text) { $anchor = strtolower($text); $anchor = preg_replace('/[^a-z0-9]/', '-', $anchor); $anchor = preg_replace('/--+/', '-', $anchor); $anchor = trim($anchor, '-'); $anchor = substr($anchor, 0, 24); $anchor = trim($anchor, '-'); $base = $anchor; $key = self::KEY_HEADER_TOC; $engine = $this->getEngine(); $anchors = $engine->getTextMetadata($key, array()); $suffix = 1; while (!strlen($anchor) || isset($anchors[$anchor])) { $anchor = $base.'-'.$suffix; $anchor = trim($anchor, '-'); $suffix++; } // When a document contains a link inside a header, like this: // // = [[ http://wwww.example.com/ | example ]] = // // ...we want to generate a TOC entry with just "example", but link the // header itself. We push the 'toc' state so all the link rules generate // just names. 
$engine->pushState('toc'); $text = $this->applyRules($text); $text = $engine->restoreText($text); $anchors[$anchor] = array($level, $text); $engine->popState('toc'); $engine->setTextMetadata($key, $anchors); return phutil_tag( 'a', array( 'name' => $anchor, ), ''); } public static function renderTableOfContents(PhutilRemarkupEngine $engine) { $key = self::KEY_HEADER_TOC; $anchors = $engine->getTextMetadata($key, array()); if (count($anchors) < 2) { // Don't generate a TOC if there are no headers, or if there's only // one header (since such a TOC would be silly). return null; } $depth = 0; $toc = array(); foreach ($anchors as $anchor => $info) { list($level, $name) = $info; while ($depth < $level) { $toc[] = hsprintf(''); $depth--; } $toc[] = phutil_tag( 'li', array(), phutil_tag( 'a', array( 'href' => '#'.$anchor, ), $name)); } while ($depth > 0) { $toc[] = hsprintf(''); $depth--; } return phutil_implode_html("\n", $toc); } } diff --git a/src/parser/PhutilEmailAddress.php b/src/parser/PhutilEmailAddress.php index bc99833..268e699 100644 --- a/src/parser/PhutilEmailAddress.php +++ b/src/parser/PhutilEmailAddress.php @@ -1,89 +1,88 @@ $/', $email_address, $matches)) { $display_name = trim($matches[1], '\'" '); if (strpos($matches[2], '@') !== false) { list($local_part, $domain_name) = explode('@', $matches[2], 2); } else { $local_part = $matches[2]; $domain_name = null; } } else if (preg_match('/^(.*)@(.*)$/', $email_address, $matches)) { $display_name = null; $local_part = $matches[1]; $domain_name = $matches[2]; } else { $display_name = null; $local_part = $email_address; $domain_name = null; } $this->displayName = $display_name; $this->localPart = $local_part; $this->domainName = $domain_name; } public function __toString() { $address = $this->getAddress(); if ($this->displayName) { return $this->displayName.' 
<'.$address.'>'; } else { return $address; } } public function setDisplayName($display_name) { $this->displayName = $display_name; return $this; } public function getDisplayName() { return $this->displayName; } public function setLocalPart($local_part) { $this->localPart = $local_part; return $this; } public function getLocalPart() { return $this->localPart; } public function setDomainName($domain_name) { $this->domainName = $domain_name; return $this; } public function getDomainName() { return $this->domainName; } public function getAddress() { $address = $this->localPart; if ($this->domainName) { $address .= '@'.$this->domainName; } return $address; } } - diff --git a/src/parser/PhutilGitURI.php b/src/parser/PhutilGitURI.php index e847a84..85b861f 100644 --- a/src/parser/PhutilGitURI.php +++ b/src/parser/PhutilGitURI.php @@ -1,89 +1,88 @@ parseURI($uri); if ($parts) { $this->user = $parts[1]; $this->domain = $parts[2]; $this->path = $parts[3]; } } private static function parseURI($uri) { $user = '(?:([^@]+)@)?'; $domain = '([^:]+)'; $path = ':(.*)'; $regexp = '/^'.$user.$domain.$path.'$/'; $matches = null; $ok = preg_match($regexp, $uri, $matches); if ($ok) { return array_pad($matches, 4, ''); } return null; } public function __toString() { $user = null; if ($this->user) { $user = $this->user.'@'; } $domain = $this->domain; $path = $this->path; return $user.$domain.':'.$path; } public function setDomain($domain) { $this->domain = $domain; return $this; } public function getDomain() { return $this->domain; } public function setPath($path) { $this->path = $path; return $this; } public function getPath() { return $this->path; } public function setUser($user) { $this->user = $user; return $this; } public function getUser() { return $this->user; } } - diff --git a/src/parser/PhutilURI.php b/src/parser/PhutilURI.php index 8cc4ceb..8746a8e 100644 --- a/src/parser/PhutilURI.php +++ b/src/parser/PhutilURI.php @@ -1,175 +1,174 @@ protocol = idx($parts, 'scheme', ''); 
$this->user = rawurldecode(idx($parts, 'user', '')); $this->pass = rawurldecode(idx($parts, 'pass', '')); $this->domain = idx($parts, 'host', ''); $this->port = (string)idx($parts, 'port', ''); $this->path = idx($parts, 'path', ''); $query = idx($parts, 'query'); if ($query) { $this->query = id(new PhutilQueryStringParser())->parseQueryString( $query); } $this->fragment = idx($parts, 'fragment', ''); } public function __toString() { $prefix = null; if ($this->protocol || $this->domain || $this->port) { $protocol = nonempty($this->protocol, 'http'); $auth = ''; if (strlen($this->user) && strlen($this->pass)) { $auth = phutil_escape_uri($this->user).':'. phutil_escape_uri($this->pass).'@'; } else if (strlen($this->user)) { $auth = phutil_escape_uri($this->user).'@'; } $prefix = $protocol.'://'.$auth.$this->domain; if ($this->port) { $prefix .= ':'.$this->port; } } if ($this->query) { $query = '?'.http_build_query($this->query); } else { $query = null; } if (strlen($this->getFragment())) { $fragment = '#'.$this->getFragment(); } else { $fragment = null; } return $prefix.$this->getPath().$query.$fragment; } public function setQueryParam($key, $value) { if ($value === null) { unset($this->query[$key]); } else { $this->query[$key] = $value; } return $this; } public function setQueryParams(array $params) { $this->query = $params; return $this; } public function getQueryParams() { return $this->query; } public function setProtocol($protocol) { $this->protocol = $protocol; return $this; } public function getProtocol() { return $this->protocol; } public function setDomain($domain) { $this->domain = $domain; return $this; } public function getDomain() { return $this->domain; } public function setPort($port) { $this->port = $port; return $this; } public function getPort() { return $this->port; } public function setPath($path) { if ($this->domain && strlen($path) && $path[0] !== '/') { $path = '/'.$path; } $this->path = $path; return $this; } public function getPath() { return 
$this->path; } public function setFragment($fragment) { $this->fragment = $fragment; return $this; } public function getFragment() { return $this->fragment; } public function setUser($user) { $this->user = $user; return $this; } public function getUser() { return $this->user; } public function setPass($pass) { $this->pass = $pass; return $this; } public function getPass() { return $this->pass; } public function alter($key, $value) { $altered = clone $this; $altered->setQueryParam($key, $value); return $altered; } } - diff --git a/src/utils/utf8.php b/src/utils/utf8.php index 96dd7fe..735f6a8 100644 --- a/src/utils/utf8.php +++ b/src/utils/utf8.php @@ -1,775 +1,774 @@ for // some discussion. Since the input limit is extremely low (less than 50KB on // my system), do this check very very slowly in PHP instead. $len = strlen($string); for ($ii = 0; $ii < $len; $ii++) { $chr = ord($string[$ii]); if ($chr >= 0x01 && $chr <= 0x7F) { continue; } else if ($chr >= 0xC2 && $chr <= 0xDF) { ++$ii; if ($ii >= $len) { return false; } $chr = ord($string[$ii]); if ($chr >= 0x80 && $chr <= 0xBF) { continue; } return false; } else if ($chr > 0xE0 && $chr <= 0xEF) { ++$ii; if ($ii >= $len) { return false; } $chr = ord($string[$ii]); if ($chr >= 0x80 && $chr <= 0xBF) { ++$ii; if ($ii >= $len) { return false; } $chr = ord($string[$ii]); if ($chr >= 0x80 && $chr <= 0xBF) { continue; } } return false; } else if ($chr == 0xE0) { ++$ii; if ($ii >= $len) { return false; } $chr = ord($string[$ii]); // NOTE: This range starts at 0xA0, not 0x80. The values 0x80-0xA0 are // "valid", but not minimal representations, and MySQL rejects them. We're // special casing this part of the range. if ($chr >= 0xA0 && $chr <= 0xBF) { ++$ii; if ($ii >= $len) { return false; } $chr = ord($string[$ii]); if ($chr >= 0x80 && $chr <= 0xBF) { continue; } } return false; } return false; } return true; } /** * Determine if a string is valid UTF-8. * * @param string Some string which may or may not be valid UTF-8. 
* @return bool True if the string is valid UTF-8. * @group utf8 */ function phutil_is_utf8($string) { if (function_exists('mb_check_encoding')) { // If mbstring is available, this is significantly faster than using PHP // regexps. return mb_check_encoding($string, 'UTF-8'); } // NOTE: This incorrectly accepts characters like \xE0\x80\x80, but should // not. The MB version works correctly. $regex = "/^(". "[\x01-\x7F]+". "|([\xC2-\xDF][\x80-\xBF])". "|([\xE0-\xEF][\x80-\xBF][\x80-\xBF])". "|([\xF0-\xF4][\x80-\xBF][\x80-\xBF][\x80-\xBF]))*\$/"; return (bool)preg_match($regex, $string); } /** * Find the character length of a UTF-8 string. * * @param string A valid utf-8 string. * @return int The character length of the string. * @group utf8 */ function phutil_utf8_strlen($string) { return strlen(utf8_decode($string)); } /** * Find the console display length of a UTF-8 string. This may differ from the * character length of the string if it contains double-width characters, like * many Chinese characters. * * This method is based on a C implementation here, which is based on the IEEE * standards. The source has more discussion and addresses more considerations * than this implementation does. * * http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c * * NOTE: We currently do not handle combining characters correctly. * * NOTE: We currently assume width 1 for East-Asian ambiguous characters. * * NOTE: This function is VERY slow. * * @param string A valid UTF-8 string. * @return int The console display length of the string. * @group utf8 */ function phutil_utf8_console_strlen($string) { $string_v = phutil_utf8v_codepoints($string); $len = 0; foreach ($string_v as $c) { if ($c == 0) { continue; } $len += 1 + ($c >= 0x1100 && ($c <= 0x115f || /* Hangul Jamo init. consonants */ $c == 0x2329 || $c == 0x232a || ($c >= 0x2e80 && $c <= 0xa4cf && $c != 0x303f) || /* CJK ... 
Yi */ ($c >= 0xac00 && $c <= 0xd7a3) || /* Hangul Syllables */ ($c >= 0xf900 && $c <= 0xfaff) || /* CJK Compatibility Ideographs */ ($c >= 0xfe10 && $c <= 0xfe19) || /* Vertical forms */ ($c >= 0xfe30 && $c <= 0xfe6f) || /* CJK Compatibility Forms */ ($c >= 0xff00 && $c <= 0xff60) || /* Fullwidth Forms */ ($c >= 0xffe0 && $c <= 0xffe6) || ($c >= 0x20000 && $c <= 0x2fffd) || ($c >= 0x30000 && $c <= 0x3fffd))); } return $len; } /** * Split a UTF-8 string into an array of characters. Combining characters are * also split. * * @param string A valid utf-8 string. * @return list A list of characters in the string. * @group utf8 */ function phutil_utf8v($string) { $res = array(); $len = strlen($string); $ii = 0; while ($ii < $len) { $byte = $string[$ii]; if ($byte <= "\x7F") { $res[] = $byte; $ii += 1; continue; } else if ($byte < "\xC0") { throw new Exception("Invalid UTF-8 string passed to phutil_utf8v()."); } else if ($byte <= "\xDF") { $seq_len = 2; } else if ($byte <= "\xEF") { $seq_len = 3; } else if ($byte <= "\xF7") { $seq_len = 4; } else if ($byte <= "\xFB") { $seq_len = 5; } else if ($byte <= "\xFD") { $seq_len = 6; } else { throw new Exception("Invalid UTF-8 string passed to phutil_utf8v()."); } if ($ii + $seq_len > $len) { throw new Exception("Invalid UTF-8 string passed to phutil_utf8v()."); } for ($jj = 1; $jj < $seq_len; ++$jj) { if ($string[$ii + $jj] >= "\xC0") { throw new Exception("Invalid UTF-8 string passed to phutil_utf8v()."); } } $res[] = substr($string, $ii, $seq_len); $ii += $seq_len; } return $res; } /** * Split a UTF-8 string into an array of codepoints (as integers). * * @param string A valid UTF-8 string. * @return list A list of codepoints, as integers. 
* @group utf8 */ function phutil_utf8v_codepoints($string) { $str_v = phutil_utf8v($string); foreach ($str_v as $key => $char) { $c = ord($char[0]); $v = 0; if (($c & 0x80) == 0) { $v = $c; } else if (($c & 0xE0) == 0xC0) { $v = (($c & 0x1F) << 6) + ((ord($char[1]) & 0x3F)); } else if (($c & 0xF0) == 0xE0) { $v = (($c & 0x0F) << 12) + ((ord($char[1]) & 0x3f) << 6) + ((ord($char[2]) & 0x3f)); } else if (($c & 0xF8) == 0xF0) { $v = (($c & 0x07) << 18) + ((ord($char[1]) & 0x3F) << 12) + ((ord($char[2]) & 0x3F) << 6) + ((ord($char[3]) & 0x3f)); } else if (($c & 0xFC) == 0xF8) { $v = (($c & 0x03) << 24) + ((ord($char[1]) & 0x3F) << 18) + ((ord($char[2]) & 0x3F) << 12) + ((ord($char[3]) & 0x3f) << 6) + ((ord($char[4]) & 0x3f)); } else if (($c & 0xFE) == 0xFC) { $v = (($c & 0x01) << 30) + ((ord($char[1]) & 0x3F) << 24) + ((ord($char[2]) & 0x3F) << 18) + ((ord($char[3]) & 0x3f) << 12) + ((ord($char[4]) & 0x3f) << 6) + ((ord($char[5]) & 0x3f)); } $str_v[$key] = $v; } return $str_v; } /** * Shorten a string to provide a summary, respecting UTF-8 characters. This * function attempts to truncate strings at word boundaries. * * NOTE: This function makes a best effort to apply some reasonable rules but * will not work well for the full range of unicode languages. * * @param string UTF-8 string to shorten. * @param int Maximum length of the result. * @param string If the string is shortened, add this at the end. Defaults to * horizontal ellipsis. * @return string A string with no more than the specified character length. * * @group utf8 */ function phutil_utf8_shorten($string, $length, $terminal = "\xE2\x80\xA6") { // If the string has fewer bytes than the minimum length, we can return // it unmodified without doing any heavy lifting. 
if (strlen($string) <= $length) { return $string; } $string_v = phutil_utf8v_combined($string); $string_len = count($string_v); if ($string_len <= $length) { // If the string is already shorter than the requested length, simply return // it unmodified. return $string; } // NOTE: This is not complete, and there are many other word boundary // characters and reasonable places to break words in the UTF-8 character // space. For now, this gives us reasonable behavior for latin langauges. We // don't necessarily have access to PCRE+Unicode so there isn't a great way // for us to look up character attributes. // If we encounter these, prefer to break on them instead of cutting the // string off in the middle of a word. static $break_characters = array( ' ' => true, "\n" => true, ';' => true, ':' => true, '[' => true, '(' => true, ',' => true, '-' => true, ); // If we encounter these, shorten to this character exactly without appending // the terminal. static $stop_characters = array( '.' => true, '!' => true, '?' => true, ); // Search backward in the string, looking for reasonable places to break it. $word_boundary = null; $stop_boundary = null; $terminal_len = phutil_utf8_strlen($terminal); // If we do a word break with a terminal, we have to look beyond at least the // number of characters in the terminal. If the terminal is longer than the // required length, we'll skip this whole block and return it on its own $terminal_area = $length - min($length, $terminal_len); for ($ii = $length; $ii >= 0; $ii--) { $c = $string_v[$ii]; if (isset($break_characters[$c]) && ($ii <= $terminal_area)) { $word_boundary = $ii; } else if (isset($stop_characters[$c]) && ($ii < $length)) { $stop_boundary = $ii + 1; break; } else { if ($word_boundary !== null) { break; } } } if ($stop_boundary !== null) { // We found a character like ".". Cut the string there, without appending // the terminal. 
$string_part = array_slice($string_v, 0, $stop_boundary); return implode('', $string_part); } // If we didn't find any boundary characters or we found ONLY boundary // characters, just break at the maximum character length. if ($word_boundary === null || $word_boundary === 0) { $word_boundary = $terminal_area; } $string_part = array_slice($string_v, 0, $word_boundary); $string_part = implode('', $string_part); return $string_part.$terminal; } /** * Hard-wrap a block of UTF-8 text with embedded HTML tags and entities. * * @param string An HTML string with tags and entities. * @return list List of hard-wrapped lines. * @group utf8 */ function phutil_utf8_hard_wrap_html($string, $width) { $break_here = array(); // Convert the UTF-8 string into a list of UTF-8 characters. $vector = phutil_utf8v($string); $len = count($vector); $char_pos = 0; for ($ii = 0; $ii < $len; ++$ii) { // An ampersand indicates an HTML entity; consume the whole thing (until // ";") but treat it all as one character. if ($vector[$ii] == '&') { do { ++$ii; } while ($vector[$ii] != ';'); ++$char_pos; // An "<" indicates an HTML tag, consume the whole thing but don't treat // it as a character. } else if ($vector[$ii] == '<') { do { ++$ii; } while ($vector[$ii] != '>'); } else { ++$char_pos; } // Keep track of where we need to break the string later. if ($char_pos == $width) { $break_here[$ii] = true; $char_pos = 0; } } $result = array(); $string = ''; foreach ($vector as $ii => $char) { $string .= $char; if (isset($break_here[$ii])) { $result[] = $string; $string = ''; } } if (strlen($string)) { $result[] = $string; } return $result; } /** * Hard-wrap a block of UTF-8 text with no embedded HTML tags and entitites * * @param string A non HTML string * @param int Width of the hard-wrapped lines * @return list List of hard-wrapped lines. 
* @group utf8 */ function phutil_utf8_hard_wrap($string, $width) { $result = array(); $lines = phutil_split_lines($string, $retain_endings = false); foreach ($lines as $line) { // Convert the UTF-8 string into a list of UTF-8 characters. $vector = phutil_utf8v($line); $len = count($vector); $buffer = ''; for ($ii = 1; $ii <= $len; ++$ii) { $buffer .= $vector[$ii - 1]; if (($ii % $width) === 0) { $result[] = $buffer; $buffer = ''; } } if (strlen($buffer)) { $result[] = $buffer; } } return $result; } /** * Convert a string from one encoding (like ISO-8859-1) to another encoding * (like UTF-8). * * This is primarily a thin wrapper around `mb_convert_encoding()` which checks * you have the extension installed, since we try to require the extension * only if you actually need it (i.e., you want to work with encodings other * than UTF-8). * * NOTE: This function assumes that the input is in the given source encoding. * If it is not, it may not output in the specified target encoding. If you * need to perform a hard conversion to UTF-8, use this function in conjunction * with @{function:phutil_utf8ize}. We can detect failures caused by invalid * encoding names, but `mb_convert_encoding()` fails silently if the * encoding name identifies a real encoding but the string is not actually * encoded with that encoding. * * @param string String to re-encode. * @param string Target encoding name, like "UTF-8". * @param string Source endocing name, like "ISO-8859-1". * @return string Input string, with converted character encoding. * * @group utf8 * * @phutil-external-symbol function mb_convert_encoding */ function phutil_utf8_convert($string, $to_encoding, $from_encoding) { if (!$from_encoding) { throw new InvalidArgumentException( "Attempting to convert a string encoding, but no source encoding ". "was provided. Explicitly provide the source encoding."); } if (!$to_encoding) { throw new InvalidArgumentException( "Attempting to convert a string encoding, but no target encoding ". 
"was provided. Explicitly provide the target encoding."); } // Normalize encoding names so we can no-op the very common case of UTF8 // to UTF8 (or any other conversion where both encodings are identical). $to_upper = strtoupper(str_replace('-', '', $to_encoding)); $from_upper = strtoupper(str_replace('-', '', $from_encoding)); if ($from_upper == $to_upper) { return $string; } if (!function_exists('mb_convert_encoding')) { throw new Exception( "Attempting to convert a string encoding from '{$from_encoding}' ". "to '{$to_encoding}', but the 'mbstring' PHP extension is not ". "available. Install mbstring to work with encodings other than ". "UTF-8."); } $result = @mb_convert_encoding($string, $to_encoding, $from_encoding); if ($result === false) { $message = error_get_last(); if ($message) { $message = idx($message, 'message', 'Unknown error.'); } throw new Exception( "String conversion from encoding '{$from_encoding}' to encoding ". "'{$to_encoding}' failed: {$message}"); } return $result; } /** * Convert a string to title case in a UTF8-aware way. This function doesn't * necessarily do a great job, but the builtin implementation of ucwords() can * completely destroy inputs, so it just has to be better than that. Similar to * @{function:ucwords}. * * @param string UTF-8 input string. * @return string Input, in some semblance of title case. * * @group utf8 */ function phutil_utf8_ucwords($str) { // NOTE: mb_convert_case() discards uppercase letters in words when converting // to title case. For example, it will convert "AAA" into "Aaa", which is // undesirable. $v = phutil_utf8v($str); $result = ''; $last = null; $ord_a = ord('a'); $ord_z = ord('z'); foreach ($v as $c) { $convert = false; if ($last === null || $last === ' ') { $o = ord($c[0]); if ($o >= $ord_a && $o <= $ord_z) { $convert = true; } } if ($convert) { $result .= phutil_utf8_strtoupper($c); } else { $result .= $c; } $last = $c; } return $result; } /** * Convert a string to lower case in a UTF8-aware way. 
Similar to * @{function:strtolower}. * * @param string UTF-8 input string. * @return string Input, in some semblance of lower case. * * @group utf8 * * @phutil-external-symbol function mb_convert_case */ function phutil_utf8_strtolower($str) { if (function_exists('mb_convert_case')) { return mb_convert_case($str, MB_CASE_LOWER, 'UTF-8'); } static $map; if ($map === null) { $map = array_combine( range('A', 'Z'), range('a', 'z')); } return phutil_utf8_strtr($str, $map); } /** * Convert a string to upper case in a UTF8-aware way. Similar to * @{function:strtoupper}. * * @param string UTF-8 input string. * @return string Input, in some semblance of upper case. * * @group utf8 * * @phutil-external-symbol function mb_convert_case */ function phutil_utf8_strtoupper($str) { if (function_exists('mb_convert_case')) { return mb_convert_case($str, MB_CASE_UPPER, 'UTF-8'); } static $map; if ($map === null) { $map = array_combine( range('a', 'z'), range('A', 'Z')); } return phutil_utf8_strtr($str, $map); } /** * Replace characters in a string in a UTF-aware way. Similar to * @{function:strtr}. * * @param string UTF-8 input string. * @param map Map of characters to replace. * @return string Input with translated characters. * * @group utf8 */ function phutil_utf8_strtr($str, array $map) { $v = phutil_utf8v($str); $result = ''; foreach ($v as $c) { if (isset($map[$c])) { $result .= $map[$c]; } else { $result .= $c; } } return $result; } /** * Determine if a given unicode character is a combining character or not. * * @param string A single unicode character. * @return boolean True or false. * * @group utf8 */ function phutil_utf8_is_combining_character($character) { $components = phutil_utf8v_codepoints($character); // Combining Diacritical Marks (0300 - 036F). // Combining Diacritical Marks Supplement (1DC0 - 1DFF). // Combining Diacritical Marks for Symbols (20D0 - 20FF). // Combining Half Marks (FE20 - FE2F). 
foreach ($components as $codepoint) { if ($codepoint >= 0x0300 && $codepoint <= 0x036F || $codepoint >= 0x1DC0 && $codepoint <= 0x1DFF || $codepoint >= 0x20D0 && $codepoint <= 0x20FF || $codepoint >= 0xFE20 && $codepoint <= 0xFE2F) { return true; } } return false; } /** * Split a UTF-8 string into an array of characters. Combining characters * are not split. * * @param string A valid utf-8 string. * @return list A list of characters in the string. * * @group utf8 */ function phutil_utf8v_combined($string) { $components = phutil_utf8v($string); $array_length = count($components); // If the first character in the string is a combining character, // prepend a space to the string. if ( $array_length > 0 && phutil_utf8_is_combining_character($components[0])) { $string = " ".$string; $components = phutil_utf8v($string); $array_length++; } for ($index = 1; $index < $array_length; $index++) { if (phutil_utf8_is_combining_character($components[$index])) { $components[$index - 1] = $components[$index - 1].$components[$index]; unset($components[$index]); $components = array_values($components); $index --; $array_length = count($components); } } return $components; } - diff --git a/src/xsprintf/PhutilQsprintfInterface.php b/src/xsprintf/PhutilQsprintfInterface.php index 8680f8d..9dd03f6 100644 --- a/src/xsprintf/PhutilQsprintfInterface.php +++ b/src/xsprintf/PhutilQsprintfInterface.php @@ -1,13 +1,12 @@ $value) { $hpp .= "#define {$node} {$value}\n"; } file_put_contents('node_names.hpp', $hpp); echo "Wrote C++ definition.\n"; $at = '@'; $php = " $value) { $php .= " {$value} => '{$node}',\n"; } $php .= " );\n"; $php .= "}\n"; file_put_contents('parser_nodes.php', $php); echo "Wrote PHP definition.\n"; - - - diff --git a/support/xhpast/xhpast.cpp b/support/xhpast/xhpast.cpp index 32dd231..b1dccdf 100644 --- a/support/xhpast/xhpast.cpp +++ b/support/xhpast/xhpast.cpp @@ -1,129 +1,128 @@ #include "ast.hpp" #include #include #include #include #include using namespace std; int 
xhpastparse(void*, xhpast::Node **); int xhpast_process(std::string &in); void print_node(xhpast::Node *node); int main(int argc, char* argv[]) { vector files; if (argc != 1) { //coupling: modify also libphutil/src/parser/xhpast/bin/xhpast_parse.php cout << "xhpast version 5.5.8/1d\n"; return 0; } ifstream inputFile; istream *inputStream; inputStream = &cin; std::stringbuf sb; *inputStream >> noskipws >> &sb; std::string buffer = sb.str(); inputFile.close(); return xhpast_process(buffer); } int xhpast_process(std::string &in) { char *buffer; in.reserve(in.size() + 1); buffer = const_cast(in.c_str()); buffer[in.size() + 1] = 0; // need double NULL for scan_buffer void* scanner; yy_extra_type extra; extra.idx_expr = true;//flags.idx_expr; extra.include_debug = true;//flags.include_debug; extra.insert_token = 0;//flags.eval ? T_OPEN_TAG_FAKE : 0; extra.short_tags = true;//flags.short_tags; extra.asp_tags = false;//flags.asp_tags; xhpast::Node *root = NULL; xhpastlex_init(&scanner); xhpastset_extra(&extra, scanner); xhpast_scan_buffer(buffer, in.size() + 2, scanner); xhpastparse(scanner, &root); xhpastlex_destroy(scanner); if (extra.terminated) { fprintf( stderr, "XHPAST Parse Error: %s on line %d\n", extra.error.c_str(), (int)extra.lineno); return 1; } printf("{"); printf("\"tree\":"); if (root) { // Extend the right token for the root node to the end of the concrete // token stream. This ensure all tokens appear in the tree. If we don't // do this and the file ends in tokens which don't go to the parser (like // comments and whitespace) they won't be represented in the tree. 
root->r_tok = (extra.token_list.size() - 1); print_node(root); } else { printf("null"); } printf(","); printf("\"stream\":"); printf("["); if (!extra.token_list.empty()) { for (xhpast::token_list_t::iterator ii = extra.token_list.begin();;) { printf("[%d, %d]", (*ii)->type, (int)(*ii)->value.length()); if (++ii != extra.token_list.end()) { printf(","); } else { break; } } } printf("]"); printf("}\n"); return 0; } void print_node(xhpast::Node *node) { int l = -1; int r = -1; if (node->l_tok != -1) { l = node->l_tok; } if (l == -1) { printf("[%d]", node->type); } else { if (node->r_tok != -1) { r = node->r_tok; } printf("[%d, %d, %d", node->type, l, r); if (!node->children.empty()) { printf(", ["); for (xhpast::node_list_t::iterator ii = node->children.begin();;) { print_node(*ii); if (++ii != node->children.end()) { printf(","); } else { break; } } printf("]"); } printf("]"); } } -