Home » Developer & Programmer » Precompilers, OCI & OCCI » extproc OCI LOB Question (11g2 win32)
extproc OCI LOB Question [message #475196] |
Mon, 13 September 2010 12:04 |
cikic
Messages: 12 Registered: September 2006 Location: Austria
|
Junior Member |
|
|
Hello!
I have never done anything with C and need some help. I need to bind libtidy (tidy.sourceforge.net) to Oracle, and I was able to compile a running example with char* and VARCHAR2. In the real case, however, I need to use CLOBs, and I have no idea how to do so, since I was not able to find helpful documents on the web.
Can you help me out (see comments):
//test3.dll
#include "tidy.h"
#include "buffio.h"
#include <stdio.h>
#include <errno.h>
#include <oci.h>
#include <ociextp.h>
/*
 * parseTidy - external-procedure entry point that runs HTML Tidy over a
 * document and reports the resulting status through *rc.
 *
 * Parameters:
 *   ctx         extproc context passed by Oracle (WITH CONTEXT); not used yet.
 *   clobinput   locator of the input CLOB; not read yet, a literal test
 *               document is parsed instead (see TODO below).
 *   rc          out: return code of the last Tidy call that was executed, or
 *               -1 when a Tidy option could not be set.
 *   cloboutxml  in/out: locator intended to receive the tidied markup;
 *               not written yet (see TODO below).
 *   clobouterr  in/out: locator intended to receive Tidy's diagnostics;
 *               not written yet (see TODO below).
 */
void parseTidy(
    OCIExtProcContext *ctx
   ,OCILobLocator *clobinput
   ,int *rc
   ,OCILobLocator **cloboutxml
   ,OCILobLocator **clobouterr
   /* ... all options ... */
)
{
    // TODO: fetch the document text from clobinput instead of this literal.
    // NOTE(review): presumably done with the OCI LOB read call using handles
    // obtained from ctx - confirm against the OCI documentation.
    char *input = "<title>Foo</title><p>Foo!"; // just to test
    TidyBuffer output = {0};
    TidyBuffer errbuf = {0};
    Bool ok;
    TidyDoc tdoc = tidyCreate();                        // Initialize "document"

    // Report failure unless a Tidy call below says otherwise.
    *rc = -1;

    /* set options */
    ok = tidyOptSetBool( tdoc, TidyXhtmlOut, yes );     // Convert to XHTML
    if ( ok )                                           // skip everything if the option was rejected
        *rc = tidySetErrorBuffer( tdoc, &errbuf );      // Capture diagnostics
    if ( *rc >= 0 )
        *rc = tidyParseString( tdoc, input );           // Parse the input
    if ( *rc >= 0 )
        *rc = tidyCleanAndRepair( tdoc );               // Tidy it up!
    if ( *rc >= 0 )
        *rc = tidyRunDiagnostics( tdoc );               // Kvetch
    if ( *rc > 1 )                                      // If error, force output.
        // Keep the current status value (*rc, not the pointer rc) when the
        // option is accepted; otherwise report -1.
        *rc = ( tidyOptSetBool(tdoc, TidyForceOutput, yes) ? *rc : -1 );
    if ( *rc >= 0 )
        *rc = tidySaveBuffer( tdoc, &output );          // Pretty Print

    // TODO: copy errbuf.bp into *clobouterr and output.bp into *cloboutxml
    // before the buffers are released below.

    tidyBufFree( &output );
    tidyBufFree( &errbuf );
    tidyRelease( tdoc );
}
SQL+
-- Register the DLL that exports parseTidy as an external library.
CREATE LIBRARY test3_lib AS 'C:\app\XPMUser\product\11.2.0\dbhome_1\BIN\test3.dll';
/
SET SERVEROUTPUT ON
-- Smoke test: call the external routine once and print its return code.
DECLARE
    -- Both output CLOBs start as a single space.
    -- NOTE(review): presumably so that the C side receives initialised
    -- locators rather than NULL - confirm.
    l_xml CLOB        := ' ';
    l_err CLOB        := ' ';
    l_rc  PLS_INTEGER := -1;

    -- Call specification mapping this local procedure onto the C function
    -- "parseTidy" in test3_lib. The PARAMETERS list gives the order in which
    -- the arguments are passed; the OUT / IN OUT ones are passed by reference.
    PROCEDURE test (
        i_xml IN     CLOB,
        o_rc  OUT    PLS_INTEGER,
        o_xml IN OUT CLOB,
        o_err IN OUT CLOB
    )
    IS LANGUAGE C
        NAME "parseTidy"
        LIBRARY test3_lib
        WITH CONTEXT
        PARAMETERS (
            CONTEXT,
            i_xml,
            o_rc  BY REFERENCE,
            o_xml BY REFERENCE,
            o_err BY REFERENCE
        );
BEGIN
    test('<title>Foo</title><p>Foo!', l_rc, l_xml, l_err);
    DBMS_OUTPUT.PUT_LINE(l_rc);
    ROLLBACK;
END;
/
Thank you very much!
Christian
|
|
|
|
Re: extproc OCI LOB Question [message #475198 is a reply to message #475197] |
Mon, 13 September 2010 12:24 |
cikic
Messages: 12 Registered: September 2006 Location: Austria
|
Junior Member |
|
|
On the tidy side everything is fine. I actually have no clue how to read from or write to a CLOB in C.
How do I have to do this? Non-working pseudo code:
char *input = OCILobRead(clobinput);
and how do I have to write to a clob:
OCILobRead(*clobouterr, errbuf.bp); // errbuf.bp is type of byte*
Thanks
|
|
|
|
|
|
|
Re: extproc OCI LOB Question [message #475207 is a reply to message #475202] |
Mon, 13 September 2010 13:25 |
cikic
Messages: 12 Registered: September 2006 Location: Austria
|
Junior Member |
|
|
Wow, I did not expect you to do this in plain vanilla PL/SQL. As far as I can read your posted code, I am not sure it can handle malformed HTML/XML such as "<title>Foo</title><p>Foo!" (end tag missing).
a quick test shows not
set def off
-- Substitution stays off for this script: the block below contains a literal '&'.
set serveroutput on
-- Quick check: run the regexp-based normalisation over a small malformed
-- document and print the result.
declare
l_page clob := '<title>Foo</title><p>Foo!';
procedure normalize
--Normalize the l_page content to be a simple "TABLE" XML page
-- Works in place on the enclosing block's l_page variable; takes no
-- parameters and returns nothing.
is
begin
-- Replace any contiguous space string by a single space
l_page := regexp_replace(l_page, '[[:space:]]+', ' ');
-- Replace "&nbsp;" string by a space (Oracle seems to not like "&nbsp;")
-- The entity had been decoded to a plain blank, which made this call a no-op.
l_page := replace(l_page, '&nbsp;', ' ');
-- Remove IMG element (any character case)
l_page := regexp_replace(l_page, '<IMG[^>]+>', '', 1, 0, 'i');
-- Remove all attributes in tags
l_page := regexp_replace(l_page, '[[:alpha:]]+=[^>]+', '');
-- Remove <SUP> parts (references to footnote in page) (any character case)
l_page := regexp_replace(l_page, '<SUP *>[[:digit:]]+</SUP *>', '', 1, 0, 'i');
-- Remove <A> tags (any character case)
l_page := regexp_replace(l_page, '<[/]{0,1}A *>', '', 1, 0, 'i');
-- Remove <DIV> tags (any character case)
l_page := regexp_replace(l_page, '<[/]{0,1}DIV *>', '', 1, 0, 'i');
-- Put remaining tags in upper case as Oracle XML query is case sensitive
-- NOTE(review): only the closing table tag is upper-cased here; the opening
-- tag is left as-is - confirm this is intended.
l_page := replace(l_page, '</table>', '</TABLE>');
l_page := replace(l_page, '<tbody>', '<TBODY>');
l_page := replace(l_page, '</tbody>', '</TBODY>');
-- These two patterns match both the opening and the closing tag forms.
l_page := regexp_replace(l_page, 'td *>', 'TD>');
l_page := regexp_replace(l_page, 'tr *>', 'TR>');
end;
begin
normalize;
dbms_output.put_line(l_page);
end;
/
OK, no surprise here, since I can see just some regexp replaces in the code.
It is a good approach, but what is missing is that I receive malformed XML (HTML), and by the way I cannot strip tags, because this is for web-scraping purposes. Is there a built-in Oracle tool to "tidy" malformed XML? Rewriting libtidy in PL/SQL seems like too much work to me ...
Many thanks; your link didn't solve my actual problem but helped me a lot with other issues! Do you have more ideas on how to clean malformed XML?
Chris
|
|
|
Re: extproc OCI LOB Question [message #475208 is a reply to message #475207] |
Mon, 13 September 2010 13:35 |
|
Michel Cadot
Messages: 68732 Registered: March 2007 Location: Saint-Maur, France, https...
|
Senior Member Account Moderator |
|
|
This code was specific to Littlefoot's case.
I don't know tidy but depending on your case I think you can do it in PL/SQL with not too much effort.
Of course, if you want to gobble ANY HTML and transform it to XML it could be much harder; HTML and XML are very different and have different purposes.
Regards
Michel
[Updated on: Mon, 13 September 2010 13:37] Report message to a moderator
|
|
|
|
|
|
|
Goto Forum:
Current Time: Sun Feb 02 08:20:24 CST 2025
|