[Xapian-discuss] docx support

Colin Bell colinabell at gmail.com
Thu Jul 24 09:08:24 BST 2008


This is how I do it using the TinyXML parser. My XML parsing may be a bit
convoluted, but it works. The same approach can be applied to PowerPoint and
Excel files too.

...
	// Map the Office 2007 (OOXML) file extensions to their MIME types.
	// Each MIME type must be a single unbroken string literal.
	mime_map["docx"] = "application/vnd.openxmlformats-officedocument.wordprocessingml.document";
	mime_map["pptx"] = "application/vnd.openxmlformats-officedocument.presentationml.presentation";
	mime_map["xlsx"] = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet";

...

//HANDLE DOCX WORD DOCUMENTS
	// A .docx file is a zip archive; each part is extracted to stdout with
	// `unzip -p` and the captured output is handed to a parser.
	// (mstdout_to_string presumably runs the command and returns its stdout --
	// confirm against its definition.)
	if (mimetype == "application/vnd.openxmlformats-officedocument.wordprocessingml.document"){
		// Metadata parts: the results are appended to fileData.
		string cmd = "unzip -p " + shell_protect(filepath) + " docProps/core.xml";
		fileData+=parseWordXMetaData(mstdout_to_string(cmd));
		cmd = "unzip -p " + shell_protect(filepath) + " docProps/app.xml";
		fileData+=parseWordXMetaData(mstdout_to_string(cmd));
		cmd = "unzip -p " + shell_protect(filepath) + " docProps/custom.xml";
		fileData+=parseWordXCustomMetaData(mstdout_to_string(cmd));
		// Main document body: run it through XmlParser and keep its dump.
		cmd = "unzip -p " + shell_protect(filepath) + " word/document.xml";
		try{
			XmlParser xmlparser;
			xmlparser.parse_html(mstdout_to_string(cmd));
			dump = xmlparser.dump;
		} catch (const ReadError&) {
			// Extraction of the body failed: report the command and give up
			// on this file.
			cout << "\"" << cmd << "\" failed - skipping\n";
			return 0;
		}
	}

// Extract custom document properties from the XML text of an OOXML
// docProps/custom.xml part.
//
// xml: the raw XML document text.
//
// For every element directly under the root, each of its child elements that
// carries text contributes one line of the form
//     name:<value of the parent's "name" attribute>="<text>"\n
// Returns the concatenation of those lines, or an empty string when the input
// has no root element or no such properties. A property element without a
// "name" attribute is reported with an empty name.
string parseWordXCustomMetaData(string xml){
	string fileData;
	TiXmlDocument doc;
	doc.Parse(xml.c_str());
	TiXmlElement* root = doc.RootElement();
	if(!root){
		return fileData;
	}
	// Walk element children only; text/comment nodes are skipped, so no
	// ToElement() result is ever dereferenced while NULL.
	for (TiXmlElement* propElem = root->FirstChildElement(); propElem;
	     propElem = propElem->NextSiblingElement()){
		// Attribute() returns NULL when the attribute is missing; constructing
		// a std::string from NULL is undefined behaviour, so guard it.
		const char* nameAttr = propElem->Attribute("name");
		const string name = nameAttr ? nameAttr : "";
		for (TiXmlElement* valueElem = propElem->FirstChildElement(); valueElem;
		     valueElem = valueElem->NextSiblingElement()){
			// GetText() is NULL when the element has no leading text child.
			const char* text = valueElem->GetText();
			if(text){
				fileData += "name:" + name + "=\"" + text + "\"\n";
			}
		}
	}
	return fileData;
}

Easy peasy ;-)

On 23 Jul 2008, at 19:38, Frank Bruzzaniti wrote:

> Is office 2007 formats like docx supported?
>
> Is there any way to get xapian to index office 2007 formats?
>
> Is there any option/procedure to add a new mime plugin?
> For example if you rename a docx .zip you can retrieve text from
> document.xml
>
> Thanks
>
> Frank
>
> _______________________________________________
> Xapian-discuss mailing list
> Xapian-discuss at lists.xapian.org
> http://lists.xapian.org/mailman/listinfo/xapian-discuss



More information about the Xapian-discuss mailing list