Page Menu
Home
c4science
Search
Configure Global Search
Log In
Files
F91082040
TextDocument2.java
No One
Temporary
Actions
Download File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Subscribers
None
File Metadata
Details
File Info
Storage
Attached
Created
Thu, Nov 7, 17:01
Size
18 KB
Mime Type
text/x-Algol68
Expires
Sat, Nov 9, 17:01 (2 d)
Engine
blob
Format
Raw Data
Handle
22192825
Attached To
R1473 warcbase
TextDocument2.java
View Options
/*
* This file is part of the Wayback archival access software
* (http://archive-access.sourceforge.net/projects/wayback/).
*
* Licensed to the Internet Archive (IA) by one or more individual
* contributors.
*
* The IA licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package
org.warcbase.data
;
import
java.io.BufferedReader
;
import
java.io.FileNotFoundException
;
import
java.io.FileReader
;
import
java.io.IOException
;
import
java.io.InputStreamReader
;
import
java.io.OutputStream
;
import
java.io.UnsupportedEncodingException
;
import
java.net.URLEncoder
;
import
java.text.ParseException
;
import
javax.servlet.ServletException
;
import
javax.servlet.http.HttpServletRequest
;
import
javax.servlet.http.HttpServletResponse
;
import
org.archive.wayback.ResultURIConverter
;
import
org.archive.wayback.archivalurl.ArchivalUrlResultURIConverter
;
import
org.archive.wayback.core.Resource
;
import
org.archive.wayback.core.CaptureSearchResult
;
import
org.archive.wayback.core.CaptureSearchResults
;
import
org.archive.wayback.core.UIResults
;
import
org.archive.wayback.core.WaybackRequest
;
import
org.archive.wayback.replay.StringHttpServletResponseWrapper
;
import
org.archive.wayback.replay.TagMagix
;
import
org.archive.wayback.util.url.UrlOperations
;
/**
* Class which wraps functionality for converting a Resource(InputStream +
* HTTP headers) into a StringBuilder, performing several common URL
* resolution methods against that StringBuilder, inserting arbitrary Strings
* into the page, and then converting the page back to a byte array.
*
* @author brad
* @version $Date$, $Revision$
*/
public
class
TextDocument2
{
/** Prefix (scheme, host and port) of the replay URIs built by the Special* converters. */
public static String SERVER_PREFIX = "http://tibanna.umiacs.umd.edu:8080/";

/**
 * Number of characters decoded per read while the resource is loaded into
 * memory for markup.
 */
private static final int C_BUFFER_SIZE = 4096;

/** Source of the raw document content; read by readFully. */
private Resource resource;

/** Capture record supplying the original URL and the capture timestamp. */
private CaptureSearchResult result;

/** Converter handed to TagMagix when URLs in the page are rewritten. */
private ResultURIConverter uriConverter;

/**
 * The internal StringBuilder holding the decoded page.
 * NOTE: this field is static, so it is shared by all TextDocument2 instances.
 */
public static StringBuilder sb;

/** Name of the character set used to decode the page and to re-encode it. */
private String charSet;

/** Encoded page bytes, cached by getBytes() or supplied through setResultBytes(). */
private byte[] resultBytes;
/**
 * Creates a document wrapper for a single captured resource.
 *
 * @param resource the resource whose content will be read
 * @param result the capture record describing the resource
 * @param uriConverter the converter used when URLs in the page are rewritten
 */
public TextDocument2(Resource resource, CaptureSearchResult result,
        ResultURIConverter uriConverter) {
    this.uriConverter = uriConverter;
    this.result = result;
    this.resource = resource;
}
/**
 * Ensures the page declares a BASE HREF.
 *
 * When TagMagix finds no existing BASE HREF in the document, a
 * {@code <base>} tag pointing at the capture's original URL is added via
 * {@link #insertAtStartOfHead(String)}. A document that already declares a
 * base is left unchanged.
 */
public void addBase() {
    // TODO: get url from Resource instead of SearchResult?
    String pageUrl = result.getOriginalUrl();
    if (TagMagix.getBaseHref(sb) == null) {
        insertAtStartOfHead("<base href=\"" + pageUrl + "\" />");
    }
}
/**
 * Rewrites the URLs that have to be correct while the page is loading.
 *
 * A BASE HREF tag is added when the document has none. The base used for
 * resolution is the existing BASE HREF if present, otherwise the capture's
 * original URL. The FRAME-SRC, META-URL, LINK-HREF and SCRIPT-SRC
 * tag-attribute pairs, the "background" attribute of any tag, CSS imports
 * and style URLs are then passed to TagMagix together with this document's
 * URI converter.
 */
public void resolvePageUrls() {
    // TODO: get url from Resource instead of SearchResult?
    final String originalUrl = result.getOriginalUrl();
    final String timestamp = result.getCaptureTimestamp();

    final String declaredBase = TagMagix.getBaseHref(sb);
    final String baseUrl;
    if (declaredBase != null) {
        baseUrl = declaredBase;
    } else {
        insertAtStartOfHead("<base href=\"" + originalUrl + "\" />");
        baseUrl = originalUrl;
    }

    final String[][] tagAttributePairs = {
        {"FRAME", "SRC"},
        {"META", "URL"},
        {"LINK", "HREF"},
        {"SCRIPT", "SRC"},
        {TagMagix.ANY_TAGNAME, "background"}
    };

    // TODO: The classic WM added a js_ to the datespec, so NotInArchives
    // can return an valid javascript doc, and not cause Javascript errors.
    for (int i = 0; i < tagAttributePairs.length; i++) {
        TagMagix.markupTagREURIC(sb, uriConverter, timestamp, baseUrl,
                tagAttributePairs[i][0], tagAttributePairs[i][1]);
    }
    TagMagix.markupCSSImports(sb, uriConverter, timestamp, baseUrl);
    TagMagix.markupStyleUrls(sb, uriConverter, timestamp, baseUrl);
}
/**
 * Rewrites every URL-carrying tag attribute in the page.
 *
 * The base used for resolution is the document's BASE HREF when it has one,
 * otherwise the capture's original URL. Tag attributes are rewritten
 * through a SpecialResultURIConverter, while CSS imports and style URLs are
 * rewritten through this document's own URI converter.
 */
public void resolveAllPageUrls() {
    // TODO: get url from Resource instead of SearchResult?
    final String originalUrl = result.getOriginalUrl();
    final String timestamp = result.getCaptureTimestamp();

    final String declaredBase = TagMagix.getBaseHref(sb);
    final String baseUrl = (declaredBase == null) ? originalUrl : declaredBase;

    final ResultURIConverter servletConverter =
            new SpecialResultURIConverter(uriConverter);

    // TODO: forms...?
    final String[][] tagAttributePairs = {
        {"FRAME", "SRC"},
        {"META", "URL"},
        {"LINK", "HREF"},
        {"SCRIPT", "SRC"},
        {"IMG", "SRC"},
        {"A", "HREF"},
        {"AREA", "HREF"},
        {"OBJECT", "CODEBASE"},
        {"OBJECT", "CDATA"},
        {"APPLET", "CODEBASE"},
        {"APPLET", "ARCHIVE"},
        {"EMBED", "SRC"},
        {"IFRAME", "SRC"},
        {TagMagix.ANY_TAGNAME, "background"}
    };

    for (int i = 0; i < tagAttributePairs.length; i++) {
        TagMagix.markupTagREURIC(sb, servletConverter, timestamp, baseUrl,
                tagAttributePairs[i][0], tagAttributePairs[i][1]);
    }
    TagMagix.markupCSSImports(sb, uriConverter, timestamp, baseUrl);
    TagMagix.markupStyleUrls(sb, uriConverter, timestamp, baseUrl);
}
/**
 * Rewrites CSS import URLs in the document, resolving them against the
 * capture's original URL with this document's URI converter.
 */
public void resolveCSSUrls() {
    // TODO: get url from Resource instead of SearchResult?
    final String originalUrl = result.getOriginalUrl();
    final String timestamp = result.getCaptureTimestamp();
    TagMagix.markupCSSImports(sb, uriConverter, timestamp, originalUrl);
}
/**
 * Rewrites the HREF attribute of REF tags, using a converter that first
 * turns mms:// URLs into http:// URLs and then delegates to this document's
 * URI converter.
 */
public void resolveASXRefUrls() {
    // TODO: get url from Resource instead of SearchResult?
    final String originalUrl = result.getOriginalUrl();
    final String timestamp = result.getCaptureTimestamp();
    final ResultURIConverter mmsAwareConverter =
            new MMSToHTTPResultURIConverter(uriConverter);
    TagMagix.markupTagREURIC(sb, mmsAwareConverter, timestamp, originalUrl,
            "REF", "HREF");
}
/**
 * Removes everything that looks like a tag from the internal StringBuilder,
 * replacing its content with the stripped text.
 *
 * The pattern matches from a '<' to the nearest following '>' on the same
 * line; '.' does not match line terminators, so a tag spanning several
 * lines is not removed.
 */
public void stripHTML() {
    final String withoutTags = java.util.regex.Pattern.compile("\\<.*?>")
            .matcher(sb)
            .replaceAll("");
    sb.replace(0, sb.length(), withoutTags);
}
/**
 * Reads the whole resource into the internal StringBuilder, decoding its
 * bytes with the given character set. A leading byte-order mark (U+FEFF)
 * is dropped. The character set name is remembered and later used by
 * {@link #getBytes()}.
 *
 * @param charSet name of the character set to decode with; when null,
 *        "UTF-8" is used, because InputStreamReader rejects a null name
 * @throws IOException if reading fails or the character set is unsupported
 */
public void readFully(String charSet) throws IOException {
    if (charSet == null) {
        charSet = "UTF-8";
    }
    this.charSet = charSet;

    // The record length is only a capacity hint. Ignore it when it is
    // unknown or does not fit an int, so the narrowing cast can never
    // produce a negative capacity.
    long declaredLength = resource.getRecordLength();
    int initialCapacity = C_BUFFER_SIZE;
    if (declaredLength > 0 && declaredLength <= Integer.MAX_VALUE) {
        initialCapacity = (int) declaredLength;
    }

    // convert bytes to characters for charset:
    // NOTE(review): the reader is not closed here, which leaves closing the
    // underlying resource to the caller -- confirm that is intended.
    InputStreamReader isr = new InputStreamReader(resource, charSet);

    // slurp the whole thing into RAM:
    sb = new StringBuilder(initialCapacity);

    // Skip a leading byte-order mark, which decodes to U+FEFF
    int firstChar = isr.read();
    if ((firstChar != '\uFEFF') && (firstChar != -1)) {
        sb.append((char) firstChar);
    }

    char[] cbuffer = new char[C_BUFFER_SIZE];
    int charsRead;
    while ((charsRead = isr.read(cbuffer, 0, C_BUFFER_SIZE)) != -1) {
        sb.append(cbuffer, 0, charsRead);
    }
}
/**
 * Reads the whole resource using the default character encoding, "UTF-8".
 * No character set detection is performed.
 *
 * @throws IOException if reading the resource fails
 */
public void readFully() throws IOException {
    // A null name must not be passed on: readFully(String) hands it to
    // InputStreamReader, which throws NullPointerException for null.
    readFully("UTF-8");
}
/**
 * Returns the page as bytes. Bytes supplied through
 * {@link #setResultBytes(byte[])} or computed by an earlier call are
 * returned as-is; otherwise the internal StringBuilder is encoded with the
 * document's character set and the result is cached.
 *
 * @return the encoded page bytes
 * @throws UnsupportedEncodingException if the character set is not supported
 * @throws IllegalStateException if no bytes are cached and the internal
 *         StringBuilder has not been created yet
 */
public byte[] getBytes() throws UnsupportedEncodingException {
    if (resultBytes != null) {
        return resultBytes;
    }
    if (sb == null) {
        throw new IllegalStateException("No internal StringBuilder");
    }
    // String.getBytes(String) throws NullPointerException for a null name,
    // so fall back to UTF-8 when no character set has been recorded.
    String encoding = (charSet != null) ? charSet : "UTF-8";
    resultBytes = sb.toString().getBytes(encoding);
    return resultBytes;
}
/**
 * Supplies the bytes that getBytes() will return, bypassing the encoding of
 * the internal StringBuilder.
 *
 * @param resultBytes the bytes to return from getBytes(); null clears them
 */
public void setResultBytes(final byte[] resultBytes) {
    this.resultBytes = resultBytes;
}
/**
 * Writes the page bytes, as returned by getBytes(), to the given stream.
 * An unsupported character set is reported as a RuntimeException.
 *
 * @param os the stream to write the page to
 * @throws IOException if writing to the stream fails
 * @throws IllegalStateException if the internal StringBuilder is missing
 */
public void writeToOutputStream(OutputStream os) throws IOException {
    if (sb == null) {
        throw new IllegalStateException("No interal StringBuffer");
    }
    final byte[] encoded;
    try {
        encoded = getBytes();
    } catch (UnsupportedEncodingException unsupported) {
        throw new RuntimeException(unsupported);
    }
    os.write(encoded);
}
/**
 * Inserts text at the very beginning of the document.
 *
 * @param toInsert the text to place at offset 0
 */
public void insertAtStartOfDocument(final String toInsert) {
    final int documentStart = 0;
    sb.insert(documentStart, toInsert);
}
/**
 * Inserts text at the offset TagMagix reports as the end of the first
 * "head" tag, or at the start of the document when it reports -1.
 *
 * @param toInsert the text to insert
 */
public void insertAtStartOfHead(String toInsert) {
    final int afterHeadTag = TagMagix.getEndOfFirstTag(sb, "head");
    sb.insert(afterHeadTag == -1 ? 0 : afterHeadTag, toInsert);
}
/**
 * Inserts text immediately before the closing body tag, or at the end of
 * the document when there is no closing body tag.
 *
 * The last "&lt;/body&gt;" is preferred, then the last "&lt;/BODY&gt;", and
 * finally the last closing body tag in any other letter case (for example
 * "&lt;/Body&gt;").
 *
 * @param toInsert the text to insert
 */
public void insertAtEndOfBody(String toInsert) {
    int insertPoint = sb.lastIndexOf("</body>");
    if (-1 == insertPoint) {
        insertPoint = sb.lastIndexOf("</BODY>");
    }
    if (-1 == insertPoint) {
        // Neither exact spelling is present; accept mixed-case variants too.
        insertPoint = lastIndexOfIgnoreCase(sb, "</body>");
    }
    if (-1 == insertPoint) {
        insertPoint = sb.length();
    }
    sb.insert(insertPoint, toInsert);
}

/** Returns the last offset at which needle occurs in text, ignoring case, or -1. */
private static int lastIndexOfIgnoreCase(CharSequence text, String needle) {
    final String haystack = text.toString();
    final int needleLength = needle.length();
    for (int i = haystack.length() - needleLength; i >= 0; i--) {
        if (haystack.regionMatches(true, i, needle, 0, needleLength)) {
            return i;
        }
    }
    return -1;
}
/**
 * Inserts text at the offset TagMagix reports as the end of the first
 * "body" tag, or at the start of the document when it reports -1.
 *
 * @param toInsert the text to insert
 */
public void insertAtStartOfBody(String toInsert) {
    final int afterBodyTag = TagMagix.getEndOfFirstTag(sb, "body");
    sb.insert(afterBodyTag == -1 ? 0 : afterBodyTag, toInsert);
}
/**
 * Forwards the request to a JSP through UIResults, using a response wrapper
 * that collects the output as a String, and returns that String.
 *
 * @param jspPath path of the JSP to forward to
 * @param httpRequest the current servlet request
 * @param httpResponse the current servlet response, wrapped for the forward
 * @param wbRequest the Wayback request passed to UIResults
 * @param results the capture search results passed to UIResults
 * @param result the capture passed to UIResults
 * @param resource the resource passed to UIResults
 * @return the String response held by the wrapper after the forward
 * @throws ServletException if the forward fails
 * @throws IOException if the forward fails
 */
public String includeJspString(String jspPath,
        HttpServletRequest httpRequest, HttpServletResponse httpResponse,
        WaybackRequest wbRequest, CaptureSearchResults results,
        CaptureSearchResult result, Resource resource)
        throws ServletException, IOException {
    final UIResults model =
            new UIResults(wbRequest, uriConverter, results, result, resource);
    final StringHttpServletResponseWrapper capturingResponse =
            new StringHttpServletResponseWrapper(httpResponse);
    model.forward(httpRequest, capturingResponse, jspPath);
    return capturingResponse.getStringResponse();
}
/**
 * Builds a script tag that loads the given JavaScript URL.
 *
 * @param jsUrl the URL placed, unescaped, in the tag's src attribute
 * @return the script tag followed by a newline
 */
public String getJSIncludeString(final String jsUrl) {
    final StringBuilder tag =
            new StringBuilder("<script type=\"text/javascript\" src=\"");
    tag.append(jsUrl);
    tag.append("\" ></script>\n");
    return tag.toString();
}
/**
 * Returns the name of the character set recorded for this document.
 *
 * @return the character set name, or null if none has been recorded
 */
public String getCharSet() {
    return this.charSet;
}
/**
 * Records the name of the character set used when the page is encoded.
 *
 * @param charSet the character set name to record
 */
public void setCharSet(final String charSet) {
    this.charSet = charSet;
}
private
class
SpecialResultURIConverter
implements
ResultURIConverter
{
private
static
final
String
EMAIL_PROTOCOL_PREFIX
=
"mailto:"
;
private
static
final
String
JAVASCRIPT_PROTOCOL_PREFIX
=
"javascript:"
;
private
ResultURIConverter
base
=
null
;
public
SpecialResultURIConverter
(
ResultURIConverter
base
)
{
this
.
base
=
base
;
}
public
String
makeReplayURI
(
String
datespec
,
String
url
)
{
//System.out.println("\ninside makeReplayURI " + datespec + " " + url + "\n");
if
(
url
.
startsWith
(
EMAIL_PROTOCOL_PREFIX
))
{
return
url
;
}
if
(
url
.
startsWith
(
JAVASCRIPT_PROTOCOL_PREFIX
))
{
return
url
;
}
//System.err.println(url);
//System.err.println(datespec);
//return base.makeReplayURI(datespec, url);
StringBuilder
sb
=
null
;
String
replayURIPrefix
=
null
;
if
(
replayURIPrefix
==
null
)
{
sb
=
new
StringBuilder
(
url
.
length
()
+
datespec
.
length
());
//sb.append(datespec);
//sb.append("http://localhost:8080/warcbase/servlet?date=");
//System.out.println("salam");
//sb.append(SERVER_PREFIX + "warcbase/servlet?date=");
sb
.
append
(
SERVER_PREFIX
+
"warcbase/servlet/"
);
sb
.
append
(
datespec
);
//sb.append("&query=");
sb
.
append
(
"/"
);
//sb.append("");
//sb.append(UrlOperations.stripDefaultPortFromUrl(url));
sb
.
append
(
url
);
//sb.append(URLEncoder.encode(UrlOperations.stripDefaultPortFromUrl(url), "US-ASCII"));
return
sb
.
toString
();
}
if
(
url
.
startsWith
(
replayURIPrefix
))
{
return
url
;
}
sb
=
new
StringBuilder
(
url
.
length
()
+
datespec
.
length
());
sb
.
append
(
replayURIPrefix
);
sb
.
append
(
datespec
);
sb
.
append
(
"/"
);
sb
.
append
(
UrlOperations
.
stripDefaultPortFromUrl
(
url
));
return
sb
.
toString
();
}
}
private
class
SpecialResultURIConverterEncoded
implements
ResultURIConverter
{
private
static
final
String
EMAIL_PROTOCOL_PREFIX
=
"mailto:"
;
private
static
final
String
JAVASCRIPT_PROTOCOL_PREFIX
=
"javascript:"
;
private
ResultURIConverter
base
=
null
;
public
SpecialResultURIConverterEncoded
(
ResultURIConverter
base
)
{
this
.
base
=
base
;
}
public
String
makeReplayURI
(
String
datespec
,
String
url
)
{
//System.out.println("\ninside makeReplayURI " + datespec + " " + url + "\n");
if
(
url
.
startsWith
(
EMAIL_PROTOCOL_PREFIX
))
{
return
url
;
}
if
(
url
.
startsWith
(
JAVASCRIPT_PROTOCOL_PREFIX
))
{
return
url
;
}
//System.err.println(url);
//System.err.println(datespec);
//return base.makeReplayURI(datespec, url);
StringBuilder
sb
=
null
;
String
replayURIPrefix
=
null
;
if
(
replayURIPrefix
==
null
)
{
sb
=
new
StringBuilder
(
url
.
length
()
+
datespec
.
length
());
//sb.append(datespec);
//sb.append("http://localhost:8080/warcbase/servlet?date=");
//System.out.println("salam");
//sb.append(SERVER_PREFIX + "warcbase/servlet?date=");
sb
.
append
(
SERVER_PREFIX
+
"warcbase/servlet/"
);
sb
.
append
(
datespec
);
//sb.append("&query=");
sb
.
append
(
"/"
);
//sb.append("");
//sb.append(UrlOperations.stripDefaultPortFromUrl(url));
/*try {
sb.append(URLEncoder.encode(UrlOperations.stripDefaultPortFromUrl(url.replaceAll("&", "&")), "US-ASCII"));
} catch (UnsupportedEncodingException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}*/
sb
.
append
(
url
);
return
sb
.
toString
();
}
if
(
url
.
startsWith
(
replayURIPrefix
))
{
return
url
;
}
sb
=
new
StringBuilder
(
url
.
length
()
+
datespec
.
length
());
sb
.
append
(
replayURIPrefix
);
sb
.
append
(
datespec
);
sb
.
append
(
"/"
);
sb
.
append
(
UrlOperations
.
stripDefaultPortFromUrl
(
url
));
return
sb
.
toString
();
}
}
/**
 * ResultURIConverter that replaces a leading "mms://" with "http://" and
 * then delegates to the wrapped converter.
 */
private class MMSToHTTPResultURIConverter implements ResultURIConverter {
    private static final String MMS_PROTOCOL_PREFIX = "mms://";
    private static final String HTTP_PROTOCOL_PREFIX = "http://";

    /** Converter that produces the final replay URI. */
    private ResultURIConverter base = null;

    /**
     * @param base the converter to delegate to
     */
    public MMSToHTTPResultURIConverter(ResultURIConverter base) {
        this.base = base;
    }

    /**
     * @param datespec the capture timestamp passed on to the delegate
     * @param url the URL to convert
     * @return the delegate's replay URI for the (possibly rewritten) URL
     */
    public String makeReplayURI(String datespec, String url) {
        final String httpUrl = url.startsWith(MMS_PROTOCOL_PREFIX)
                ? HTTP_PROTOCOL_PREFIX + url.substring(MMS_PROTOCOL_PREFIX.length())
                : url;
        return base.makeReplayURI(datespec, httpUrl);
    }
}
/**
 * Development harness: loads a hard-coded local HTML file, rewrites its
 * URLs as if it were a capture of http://www.ayotte.senate.gov/ taken at
 * 2013-02-27T15:01:31Z, and prints the result to standard output.
 *
 * The loaded page is also stored in the static {@link #sb} field. If the
 * input file does not exist, the stack trace is printed and the method
 * returns without touching {@link #sb}.
 */
public void doJob() {
    final String ls = System.getProperty("line.separator");
    final StringBuilder page = new StringBuilder();

    BufferedReader bReader = null;
    try {
        bReader = new BufferedReader(
                new FileReader("/Users/milad/workspace/us_senate/test.html"));
        String line;
        while ((line = bReader.readLine()) != null) {
            page.append(line);
            page.append(ls);
        }
    } catch (FileNotFoundException e) {
        // Without the input file there is nothing to rewrite, so stop here
        // rather than dereferencing a null reader.
        e.printStackTrace();
        return;
    } catch (IOException e) {
        // Best effort: report the failure and continue with what was read.
        e.printStackTrace();
    } finally {
        if (bReader != null) {
            try {
                bReader.close();
            } catch (IOException ignored) {
                // Nothing useful can be done if closing the reader fails.
            }
        }
    }

    sb = page;

    String pageUrl = "http://www.ayotte.senate.gov/";
    String captureDate = "2013-02-27T15:01:31Z";

    String existingBaseHref = TagMagix.getBaseHref(page);
    if (existingBaseHref != null) {
        pageUrl = existingBaseHref;
    }

    ResultURIConverter uriConverter = new ArchivalUrlResultURIConverter();
    ResultURIConverter ruc = new SpecialResultURIConverter(uriConverter);

    String[][] markups = {
        {"FRAME", "SRC"},
        {"META", "URL"},
        {"LINK", "HREF"},
        {"SCRIPT", "SRC"},
        {"IMG", "SRC"},
        {"A", "HREF"},
        {"AREA", "HREF"},
        {"OBJECT", "CODEBASE"},
        {"OBJECT", "CDATA"},
        {"APPLET", "CODEBASE"},
        {"APPLET", "ARCHIVE"},
        {"EMBED", "SRC"},
        {"IFRAME", "SRC"},
        {TagMagix.ANY_TAGNAME, "background"}
    };

    for (String[] tagAttr : markups) {
        TagMagix.markupTagREURIC(page, ruc, captureDate, pageUrl,
                tagAttr[0], tagAttr[1]);
    }
    TagMagix.markupCSSImports(page, uriConverter, captureDate, pageUrl);
    TagMagix.markupStyleUrls(page, uriConverter, captureDate, pageUrl);

    System.out.println(page);
}
/**
 * Rewrites the URLs in an HTML page so they point at the warcbase servlet.
 *
 * The page is also stored in the static {@link #sb} field. If the page
 * declares a BASE HREF, it replaces pageUrl as the base for resolution.
 *
 * @param content the HTML page to rewrite
 * @param pageUrl the URL the page was captured from
 * @param captureDate the capture timestamp placed in the rewritten URLs
 * @return the page with its URLs rewritten
 */
public String fixURLs(String content, String pageUrl, String captureDate) {
    // Work on a local reference: sb is a static field shared by all
    // instances, so it may be reassigned while this method is running.
    final StringBuilder page = new StringBuilder(content);
    sb = page;

    String existingBaseHref = TagMagix.getBaseHref(page);
    if (existingBaseHref != null) {
        pageUrl = existingBaseHref;
    }

    ResultURIConverter uriConverter = new ArchivalUrlResultURIConverter();
    ResultURIConverter rucEncoded =
            new SpecialResultURIConverterEncoded(uriConverter);

    String[][] markups = {
        {"FRAME", "SRC"},
        {"META", "URL"},
        {"LINK", "HREF"},
        {"SCRIPT", "SRC"},
        {"IMG", "SRC"},
        {"INPUT", "SRC"},
        {"A", "HREF"},
        {"AREA", "HREF"},
        {"OBJECT", "CODEBASE"},
        {"OBJECT", "CDATA"},
        {"APPLET", "CODEBASE"},
        {"APPLET", "ARCHIVE"},
        {"EMBED", "SRC"},
        {"IFRAME", "SRC"},
        {TagMagix.ANY_TAGNAME, "background"}
    };

    for (String[] tagAttr : markups) {
        TagMagix.markupTagREURIC(page, rucEncoded, captureDate, pageUrl,
                tagAttr[0], tagAttr[1]);
    }

    // Decode HTML-escaped ampersands in the base URL before it is used for
    // CSS URLs. The entity is spelled as a concatenation so the literal
    // survives tools that decode HTML entities in source listings.
    final String escapedAmpersand = "&" + "amp;";
    final String cssBaseUrl = pageUrl.replace(escapedAmpersand, "&");

    ResultURIConverter ruc = new SpecialResultURIConverter(uriConverter);
    TagMagix.markupCSSImports(page, ruc, captureDate, cssBaseUrl);
    TagMagix.markupStyleUrls(page, ruc, captureDate, cssBaseUrl);

    return page.toString();
}
/**
 * Command-line entry point: runs doJob() on a document created without a
 * resource, capture or URI converter.
 *
 * @param args ignored
 */
public static void main(String[] args) {
    new TextDocument2(null, null, null).doJob();
}
}
Event Timeline
Log In to Comment